#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.arch   armv8-a+crypto
.text
.section        .rodata
.align  5
.Lrcon:
.long   0x01,0x01,0x01,0x01
.long   0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d     // rotate-n-splat
.long   0x1b,0x1b,0x1b,0x1b
.previous
.globl  aes_v8_set_encrypt_key
.type   aes_v8_set_encrypt_key,%function
.align  5
aes_v8_set_encrypt_key:
.Lenc_key:
        AARCH64_VALID_CALL_TARGET
        // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
        stp     x29,x30,[sp,#-16]!
        add     x29,sp,#0
        mov     x3,#-1
        cmp     x0,#0
        b.eq    .Lenc_key_abort
        cmp     x2,#0
        b.eq    .Lenc_key_abort
        mov     x3,#-2
        cmp     w1,#128
        b.lt    .Lenc_key_abort
        cmp     w1,#256
        b.gt    .Lenc_key_abort
        tst     w1,#0x3f
        b.ne    .Lenc_key_abort

        adrp    x3,.Lrcon
        add     x3,x3,#:lo12:.Lrcon
        cmp     w1,#192

        eor     v0.16b,v0.16b,v0.16b
        ld1     {v3.16b},[x0],#16
        mov     w1,#8           // reuse w1
        ld1     {v1.4s,v2.4s},[x3],#32

        b.lt    .Loop128
        b.eq    .L192
        b       .L256

.align  4
.Loop128:
        tbl     v6.16b,{v3.16b},v2.16b
        ext     v5.16b,v0.16b,v3.16b,#12
        st1     {v3.4s},[x2],#16
        aese    v6.16b,v0.16b
        subs    w1,w1,#1

        eor     v3.16b,v3.16b,v5.16b
        ext     v5.16b,v0.16b,v5.16b,#12
        eor     v3.16b,v3.16b,v5.16b
        ext     v5.16b,v0.16b,v5.16b,#12
        eor     v6.16b,v6.16b,v1.16b
        eor     v3.16b,v3.16b,v5.16b
        shl     v1.16b,v1.16b,#1
        eor     v3.16b,v3.16b,v6.16b
        b.ne    .Loop128

        ld1     {v1.4s},[x3]

        tbl     v6.16b,{v3.16b},v2.16b
        ext     v5.16b,v0.16b,v3.16b,#12
        st1     {v3.4s},[x2],#16
        aese    v6.16b,v0.16b

        eor     v3.16b,v3.16b,v5.16b
        ext     v5.16b,v0.16b,v5.16b,#12
        eor     v3.16b,v3.16b,v5.16b
        ext     v5.16b,v0.16b,v5.16b,#12
        eor     v6.16b,v6.16b,v1.16b
        eor     v3.16b,v3.16b,v5.16b
        shl     v1.16b,v1.16b,#1
        eor     v3.16b,v3.16b,v6.16b

        tbl     v6.16b,{v3.16b},v2.16b
        ext     v5.16b,v0.16b,v3.16b,#12
        st1     {v3.4s},[x2],#16
        aese    v6.16b,v0.16b

        eor     v3.16b,v3.16b,v5.16b
        ext     v5.16b,v0.16b,v5.16b,#12
        eor     v3.16b,v3.16b,v5.16b
        ext     v5.16b,v0.16b,v5.16b,#12
        eor     v6.16b,v6.16b,v1.16b
        eor     v3.16b,v3.16b,v5.16b
        eor     v3.16b,v3.16b,v6.16b
        st1     {v3.4s},[x2]
        add     x2,x2,#0x50

        mov     w12,#10
        b       .Ldone

.align  4
.L192:
        ld1     {v4.8b},[x0],#8
        movi    v6.16b,#8                       // borrow v6.16b
        st1     {v3.4s},[x2],#16
        sub     v2.16b,v2.16b,v6.16b    // adjust the mask

.Loop192:
        tbl     v6.16b,{v4.16b},v2.16b
        ext     v5.16b,v0.16b,v3.16b,#12
#ifdef __AARCH64EB__
        st1     {v4.4s},[x2],#16
        sub     x2,x2,#8
#else
        st1     {v4.8b},[x2],#8
#endif
        aese    v6.16b,v0.16b
        subs    w1,w1,#1

        eor     v3.16b,v3.16b,v5.16b
        ext     v5.16b,v0.16b,v5.16b,#12
        eor     v3.16b,v3.16b,v5.16b
        ext     v5.16b,v0.16b,v5.16b,#12
        eor     v3.16b,v3.16b,v5.16b

        dup     v5.4s,v3.s[3]
        eor     v5.16b,v5.16b,v4.16b
        eor     v6.16b,v6.16b,v1.16b
        ext     v4.16b,v0.16b,v4.16b,#12
        shl     v1.16b,v1.16b,#1
        eor     v4.16b,v4.16b,v5.16b
        eor     v3.16b,v3.16b,v6.16b
        eor     v4.16b,v4.16b,v6.16b
        st1     {v3.4s},[x2],#16
        b.ne    .Loop192

        mov     w12,#12
        add     x2,x2,#0x20
        b       .Ldone

.align  4
.L256:
        ld1     {v4.16b},[x0]
        mov     w1,#7
        mov     w12,#14
        st1     {v3.4s},[x2],#16

.Loop256:
        tbl     v6.16b,{v4.16b},v2.16b
        ext     v5.16b,v0.16b,v3.16b,#12
        st1     {v4.4s},[x2],#16
        aese    v6.16b,v0.16b
        subs    w1,w1,#1

        eor     v3.16b,v3.16b,v5.16b
        ext     v5.16b,v0.16b,v5.16b,#12
        eor     v3.16b,v3.16b,v5.16b
        ext     v5.16b,v0.16b,v5.16b,#12
        eor     v6.16b,v6.16b,v1.16b
        eor     v3.16b,v3.16b,v5.16b
        shl     v1.16b,v1.16b,#1
        eor     v3.16b,v3.16b,v6.16b
        st1     {v3.4s},[x2],#16
        b.eq    .Ldone

        dup     v6.4s,v3.s[3]           // just splat
        ext     v5.16b,v0.16b,v4.16b,#12
        aese    v6.16b,v0.16b

        eor     v4.16b,v4.16b,v5.16b
        ext     v5.16b,v0.16b,v5.16b,#12
        eor     v4.16b,v4.16b,v5.16b
        ext     v5.16b,v0.16b,v5.16b,#12
        eor     v4.16b,v4.16b,v5.16b

        eor     v4.16b,v4.16b,v6.16b
        b       .Loop256

.Ldone:
        str     w12,[x2]
        mov     x3,#0

.Lenc_key_abort:
        mov     x0,x3                   // return value
        ldr     x29,[sp],#16
        ret
.size   aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key

.globl  aes_v8_set_decrypt_key
.type   aes_v8_set_decrypt_key,%function
.align  5
aes_v8_set_decrypt_key:
        AARCH64_SIGN_LINK_REGISTER
        stp     x29,x30,[sp,#-16]!
        add     x29,sp,#0
        bl      .Lenc_key

        cmp     x0,#0
        b.ne    .Ldec_key_abort

        sub     x2,x2,#240              // restore original x2
        mov     x4,#-16
        add     x0,x2,x12,lsl#4 // end of key schedule

        ld1     {v0.4s},[x2]
        ld1     {v1.4s},[x0]
        st1     {v0.4s},[x0],x4
        st1     {v1.4s},[x2],#16

.Loop_imc:
        ld1     {v0.4s},[x2]
        ld1     {v1.4s},[x0]
        aesimc  v0.16b,v0.16b
        aesimc  v1.16b,v1.16b
        st1     {v0.4s},[x0],x4
        st1     {v1.4s},[x2],#16
        cmp     x0,x2
        b.hi    .Loop_imc

        ld1     {v0.4s},[x2]
        aesimc  v0.16b,v0.16b
        st1     {v0.4s},[x0]

        eor     x0,x0,x0                // return value
.Ldec_key_abort:
        ldp     x29,x30,[sp],#16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
.size   aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
.globl  aes_v8_encrypt
.type   aes_v8_encrypt,%function
.align  5
aes_v8_encrypt:
        AARCH64_VALID_CALL_TARGET
        ldr     w3,[x2,#240]
        ld1     {v0.4s},[x2],#16
        ld1     {v2.16b},[x0]
        sub     w3,w3,#2
        ld1     {v1.4s},[x2],#16

.Loop_enc:
        aese    v2.16b,v0.16b
        aesmc   v2.16b,v2.16b
        ld1     {v0.4s},[x2],#16
        subs    w3,w3,#2
        aese    v2.16b,v1.16b
        aesmc   v2.16b,v2.16b
        ld1     {v1.4s},[x2],#16
        b.gt    .Loop_enc

        aese    v2.16b,v0.16b
        aesmc   v2.16b,v2.16b
        ld1     {v0.4s},[x2]
        aese    v2.16b,v1.16b
        eor     v2.16b,v2.16b,v0.16b

        st1     {v2.16b},[x1]
        ret
.size   aes_v8_encrypt,.-aes_v8_encrypt
.globl  aes_v8_decrypt
.type   aes_v8_decrypt,%function
.align  5
aes_v8_decrypt:
        AARCH64_VALID_CALL_TARGET
        ldr     w3,[x2,#240]
        ld1     {v0.4s},[x2],#16
        ld1     {v2.16b},[x0]
        sub     w3,w3,#2
        ld1     {v1.4s},[x2],#16

.Loop_dec:
        aesd    v2.16b,v0.16b
        aesimc  v2.16b,v2.16b
        ld1     {v0.4s},[x2],#16
        subs    w3,w3,#2
        aesd    v2.16b,v1.16b
        aesimc  v2.16b,v2.16b
        ld1     {v1.4s},[x2],#16
        b.gt    .Loop_dec

        aesd    v2.16b,v0.16b
        aesimc  v2.16b,v2.16b
        ld1     {v0.4s},[x2]
        aesd    v2.16b,v1.16b
        eor     v2.16b,v2.16b,v0.16b

        st1     {v2.16b},[x1]
        ret
.size   aes_v8_decrypt,.-aes_v8_decrypt
.globl  aes_v8_ecb_encrypt
.type   aes_v8_ecb_encrypt,%function
.align  5
aes_v8_ecb_encrypt:
        AARCH64_VALID_CALL_TARGET
        subs    x2,x2,#16
        // Original input data size bigger than 16, jump to big size processing.
        b.ne    .Lecb_big_size
        ld1     {v0.16b},[x0]
        cmp     w4,#0                                   // en- or decrypting?
        ldr     w5,[x3,#240]
        ld1     {v5.4s,v6.4s},[x3],#32                  // load key schedule...

        b.eq    .Lecb_small_dec
        aese    v0.16b,v5.16b
        aesmc   v0.16b,v0.16b
        ld1     {v16.4s,v17.4s},[x3],#32                        // load key schedule...
        aese    v0.16b,v6.16b
        aesmc   v0.16b,v0.16b
        subs    w5,w5,#10                       // if rounds==10, jump to aes-128-ecb processing
        b.eq    .Lecb_128_enc
.Lecb_round_loop:
        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        ld1     {v16.4s},[x3],#16                               // load key schedule...
        aese    v0.16b,v17.16b
        aesmc   v0.16b,v0.16b
        ld1     {v17.4s},[x3],#16                               // load key schedule...
        subs    w5,w5,#2                        // bias
        b.gt    .Lecb_round_loop
.Lecb_128_enc:
        ld1     {v18.4s,v19.4s},[x3],#32                // load key schedule...
        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        aese    v0.16b,v17.16b
        aesmc   v0.16b,v0.16b
        ld1     {v20.4s,v21.4s},[x3],#32                // load key schedule...
        aese    v0.16b,v18.16b
        aesmc   v0.16b,v0.16b
        aese    v0.16b,v19.16b
        aesmc   v0.16b,v0.16b
        ld1     {v22.4s,v23.4s},[x3],#32                // load key schedule...
        aese    v0.16b,v20.16b
        aesmc   v0.16b,v0.16b
        aese    v0.16b,v21.16b
        aesmc   v0.16b,v0.16b
        ld1     {v7.4s},[x3]
        aese    v0.16b,v22.16b
        aesmc   v0.16b,v0.16b
        aese    v0.16b,v23.16b
        eor     v0.16b,v0.16b,v7.16b
        st1     {v0.16b},[x1]
        b       .Lecb_Final_abort
.Lecb_small_dec:
        aesd    v0.16b,v5.16b
        aesimc  v0.16b,v0.16b
        ld1     {v16.4s,v17.4s},[x3],#32                        // load key schedule...
        aesd    v0.16b,v6.16b
        aesimc  v0.16b,v0.16b
        subs    w5,w5,#10                       // bias
        b.eq    .Lecb_128_dec
.Lecb_dec_round_loop:
        aesd    v0.16b,v16.16b
        aesimc  v0.16b,v0.16b
        ld1     {v16.4s},[x3],#16                               // load key schedule...
        aesd    v0.16b,v17.16b
        aesimc  v0.16b,v0.16b
        ld1     {v17.4s},[x3],#16                               // load key schedule...
        subs    w5,w5,#2                        // bias
        b.gt    .Lecb_dec_round_loop
.Lecb_128_dec:
        ld1     {v18.4s,v19.4s},[x3],#32                // load key schedule...
        aesd    v0.16b,v16.16b
        aesimc  v0.16b,v0.16b
        aesd    v0.16b,v17.16b
        aesimc  v0.16b,v0.16b
        ld1     {v20.4s,v21.4s},[x3],#32                // load key schedule...
        aesd    v0.16b,v18.16b
        aesimc  v0.16b,v0.16b
        aesd    v0.16b,v19.16b
        aesimc  v0.16b,v0.16b
        ld1     {v22.4s,v23.4s},[x3],#32                // load key schedule...
        aesd    v0.16b,v20.16b
        aesimc  v0.16b,v0.16b
        aesd    v0.16b,v21.16b
        aesimc  v0.16b,v0.16b
        ld1     {v7.4s},[x3]
        aesd    v0.16b,v22.16b
        aesimc  v0.16b,v0.16b
        aesd    v0.16b,v23.16b
        eor     v0.16b,v0.16b,v7.16b
        st1     {v0.16b},[x1]
        b       .Lecb_Final_abort
.Lecb_big_size:
        stp     x29,x30,[sp,#-16]!
        add     x29,sp,#0
        mov     x8,#16
        b.lo    .Lecb_done
        csel    x8,xzr,x8,eq

        cmp     w4,#0                                   // en- or decrypting?
        ldr     w5,[x3,#240]
        and     x2,x2,#-16
        ld1     {v0.16b},[x0],x8

        ld1     {v16.4s,v17.4s},[x3]                            // load key schedule...
        sub     w5,w5,#6
        add     x7,x3,x5,lsl#4                          // pointer to last 7 round keys
        sub     w5,w5,#2
        ld1     {v18.4s,v19.4s},[x7],#32
        ld1     {v20.4s,v21.4s},[x7],#32
        ld1     {v22.4s,v23.4s},[x7],#32
        ld1     {v7.4s},[x7]

        add     x7,x3,#32
        mov     w6,w5
        b.eq    .Lecb_dec

        ld1     {v1.16b},[x0],#16
        subs    x2,x2,#32                               // bias
        add     w6,w5,#2
        orr     v3.16b,v1.16b,v1.16b
        orr     v24.16b,v1.16b,v1.16b
        orr     v1.16b,v0.16b,v0.16b
        b.lo    .Lecb_enc_tail

        orr     v1.16b,v3.16b,v3.16b
        ld1     {v24.16b},[x0],#16
        cmp     x2,#32
        b.lo    .Loop3x_ecb_enc

        ld1     {v25.16b},[x0],#16
        ld1     {v26.16b},[x0],#16
        sub     x2,x2,#32                               // bias
        mov     w6,w5

.Loop5x_ecb_enc:
        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v16.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v16.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v16.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v16.16b
        aesmc   v26.16b,v26.16b
        ld1     {v16.4s},[x7],#16
        subs    w6,w6,#2
        aese    v0.16b,v17.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v17.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v17.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v17.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v17.16b
        aesmc   v26.16b,v26.16b
        ld1     {v17.4s},[x7],#16
        b.gt    .Loop5x_ecb_enc

        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v16.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v16.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v16.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v16.16b
        aesmc   v26.16b,v26.16b
        cmp     x2,#0x40                                        // because .Lecb_enc_tail4x
        sub     x2,x2,#0x50

        aese    v0.16b,v17.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v17.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v17.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v17.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v17.16b
        aesmc   v26.16b,v26.16b
        csel    x6,xzr,x2,gt                    // borrow x6, w6, "gt" is not typo
        mov     x7,x3

        aese    v0.16b,v18.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v18.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v18.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v18.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v18.16b
        aesmc   v26.16b,v26.16b
        add     x0,x0,x6                                // x0 is adjusted in such way that
                                                        // at exit from the loop v1.16b-v26.16b
                                                        // are loaded with last "words"
        add     x6,x2,#0x60                 // because .Lecb_enc_tail4x

        aese    v0.16b,v19.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v19.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v19.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v19.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v19.16b
        aesmc   v26.16b,v26.16b

        aese    v0.16b,v20.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v20.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v20.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v20.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v20.16b
        aesmc   v26.16b,v26.16b

        aese    v0.16b,v21.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v21.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v21.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v21.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v21.16b
        aesmc   v26.16b,v26.16b

        aese    v0.16b,v22.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v22.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v22.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v22.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v22.16b
        aesmc   v26.16b,v26.16b

        aese    v0.16b,v23.16b
        ld1     {v2.16b},[x0],#16
        aese    v1.16b,v23.16b
        ld1     {v3.16b},[x0],#16
        aese    v24.16b,v23.16b
        ld1     {v27.16b},[x0],#16
        aese    v25.16b,v23.16b
        ld1     {v28.16b},[x0],#16
        aese    v26.16b,v23.16b
        ld1     {v29.16b},[x0],#16
        cbz     x6,.Lecb_enc_tail4x
        ld1     {v16.4s},[x7],#16                       // re-pre-load rndkey[0]
        eor     v4.16b,v7.16b,v0.16b
        orr     v0.16b,v2.16b,v2.16b
        eor     v5.16b,v7.16b,v1.16b
        orr     v1.16b,v3.16b,v3.16b
        eor     v17.16b,v7.16b,v24.16b
        orr     v24.16b,v27.16b,v27.16b
        eor     v30.16b,v7.16b,v25.16b
        orr     v25.16b,v28.16b,v28.16b
        eor     v31.16b,v7.16b,v26.16b
        st1     {v4.16b},[x1],#16
        orr     v26.16b,v29.16b,v29.16b
        st1     {v5.16b},[x1],#16
        mov     w6,w5
        st1     {v17.16b},[x1],#16
        ld1     {v17.4s},[x7],#16                       // re-pre-load rndkey[1]
        st1     {v30.16b},[x1],#16
        st1     {v31.16b},[x1],#16
        b.hs    .Loop5x_ecb_enc

        add     x2,x2,#0x50
        cbz     x2,.Lecb_done

        add     w6,w5,#2
        subs    x2,x2,#0x30
        orr     v0.16b,v27.16b,v27.16b
        orr     v1.16b,v28.16b,v28.16b
        orr     v24.16b,v29.16b,v29.16b
        b.lo    .Lecb_enc_tail

        b       .Loop3x_ecb_enc

.align  4
.Lecb_enc_tail4x:
        eor     v5.16b,v7.16b,v1.16b
        eor     v17.16b,v7.16b,v24.16b
        eor     v30.16b,v7.16b,v25.16b
        eor     v31.16b,v7.16b,v26.16b
        st1     {v5.16b},[x1],#16
        st1     {v17.16b},[x1],#16
        st1     {v30.16b},[x1],#16
        st1     {v31.16b},[x1],#16

        b       .Lecb_done
.align  4
.Loop3x_ecb_enc:
        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v16.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v16.16b
        aesmc   v24.16b,v24.16b
        ld1     {v16.4s},[x7],#16
        subs    w6,w6,#2
        aese    v0.16b,v17.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v17.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v17.16b
        aesmc   v24.16b,v24.16b
        ld1     {v17.4s},[x7],#16
        b.gt    .Loop3x_ecb_enc

        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v16.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v16.16b
        aesmc   v24.16b,v24.16b
        subs    x2,x2,#0x30
        csel    x6,x2,x6,lo                             // x6, w6, is zero at this point
        aese    v0.16b,v17.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v17.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v17.16b
        aesmc   v24.16b,v24.16b
        add     x0,x0,x6                        // x0 is adjusted in such way that
                                                // at exit from the loop v1.16b-v24.16b
                                                // are loaded with last "words"
        mov     x7,x3
        aese    v0.16b,v20.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v20.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v20.16b
        aesmc   v24.16b,v24.16b
        ld1     {v2.16b},[x0],#16
        aese    v0.16b,v21.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v21.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v21.16b
        aesmc   v24.16b,v24.16b
        ld1     {v3.16b},[x0],#16
        aese    v0.16b,v22.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v22.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v22.16b
        aesmc   v24.16b,v24.16b
        ld1     {v27.16b},[x0],#16
        aese    v0.16b,v23.16b
        aese    v1.16b,v23.16b
        aese    v24.16b,v23.16b
        ld1     {v16.4s},[x7],#16               // re-pre-load rndkey[0]
        add     w6,w5,#2
        eor     v4.16b,v7.16b,v0.16b
        eor     v5.16b,v7.16b,v1.16b
        eor     v24.16b,v24.16b,v7.16b
        ld1     {v17.4s},[x7],#16               // re-pre-load rndkey[1]
        st1     {v4.16b},[x1],#16
        orr     v0.16b,v2.16b,v2.16b
        st1     {v5.16b},[x1],#16
        orr     v1.16b,v3.16b,v3.16b
        st1     {v24.16b},[x1],#16
        orr     v24.16b,v27.16b,v27.16b
        b.hs    .Loop3x_ecb_enc

        cmn     x2,#0x30
        b.eq    .Lecb_done
        nop

.Lecb_enc_tail:
        aese    v1.16b,v16.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v16.16b
        aesmc   v24.16b,v24.16b
        ld1     {v16.4s},[x7],#16
        subs    w6,w6,#2
        aese    v1.16b,v17.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v17.16b
        aesmc   v24.16b,v24.16b
        ld1     {v17.4s},[x7],#16
        b.gt    .Lecb_enc_tail

        aese    v1.16b,v16.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v16.16b
        aesmc   v24.16b,v24.16b
        aese    v1.16b,v17.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v17.16b
        aesmc   v24.16b,v24.16b
        aese    v1.16b,v20.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v20.16b
        aesmc   v24.16b,v24.16b
        cmn     x2,#0x20
        aese    v1.16b,v21.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v21.16b
        aesmc   v24.16b,v24.16b
        aese    v1.16b,v22.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v22.16b
        aesmc   v24.16b,v24.16b
        aese    v1.16b,v23.16b
        aese    v24.16b,v23.16b
        b.eq    .Lecb_enc_one
        eor     v5.16b,v7.16b,v1.16b
        eor     v17.16b,v7.16b,v24.16b
        st1     {v5.16b},[x1],#16
        st1     {v17.16b},[x1],#16
        b       .Lecb_done

.Lecb_enc_one:
        eor     v5.16b,v7.16b,v24.16b
        st1     {v5.16b},[x1],#16
        b       .Lecb_done
.align  5
.Lecb_dec:
        ld1     {v1.16b},[x0],#16
        subs    x2,x2,#32                       // bias
        add     w6,w5,#2
        orr     v3.16b,v1.16b,v1.16b
        orr     v24.16b,v1.16b,v1.16b
        orr     v1.16b,v0.16b,v0.16b
        b.lo    .Lecb_dec_tail

        orr     v1.16b,v3.16b,v3.16b
        ld1     {v24.16b},[x0],#16
        cmp     x2,#32
        b.lo    .Loop3x_ecb_dec

        ld1     {v25.16b},[x0],#16
        ld1     {v26.16b},[x0],#16
        sub     x2,x2,#32                               // bias
        mov     w6,w5

.Loop5x_ecb_dec:
        aesd    v0.16b,v16.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v16.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v16.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v16.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v16.16b
        aesimc  v26.16b,v26.16b
        ld1     {v16.4s},[x7],#16
        subs    w6,w6,#2
        aesd    v0.16b,v17.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v17.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v17.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v17.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v17.16b
        aesimc  v26.16b,v26.16b
        ld1     {v17.4s},[x7],#16
        b.gt    .Loop5x_ecb_dec

        aesd    v0.16b,v16.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v16.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v16.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v16.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v16.16b
        aesimc  v26.16b,v26.16b
        cmp     x2,#0x40                                // because .Lecb_tail4x
        sub     x2,x2,#0x50

        aesd    v0.16b,v17.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v17.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v17.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v17.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v17.16b
        aesimc  v26.16b,v26.16b
        csel    x6,xzr,x2,gt            // borrow x6, w6, "gt" is not typo
        mov     x7,x3

        aesd    v0.16b,v18.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v18.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v18.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v18.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v18.16b
        aesimc  v26.16b,v26.16b
        add     x0,x0,x6                                // x0 is adjusted in such way that
                                                        // at exit from the loop v1.16b-v26.16b
                                                        // are loaded with last "words"
        add     x6,x2,#0x60                     // because .Lecb_tail4x

        aesd    v0.16b,v19.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v19.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v19.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v19.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v19.16b
        aesimc  v26.16b,v26.16b

        aesd    v0.16b,v20.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v20.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v20.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v20.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v20.16b
        aesimc  v26.16b,v26.16b

        aesd    v0.16b,v21.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v21.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v21.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v21.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v21.16b
        aesimc  v26.16b,v26.16b

        aesd    v0.16b,v22.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v22.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v22.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v22.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v22.16b
        aesimc  v26.16b,v26.16b

        aesd    v0.16b,v23.16b
        ld1     {v2.16b},[x0],#16
        aesd    v1.16b,v23.16b
        ld1     {v3.16b},[x0],#16
        aesd    v24.16b,v23.16b
        ld1     {v27.16b},[x0],#16
        aesd    v25.16b,v23.16b
        ld1     {v28.16b},[x0],#16
        aesd    v26.16b,v23.16b
        ld1     {v29.16b},[x0],#16
        cbz     x6,.Lecb_tail4x
        ld1     {v16.4s},[x7],#16                       // re-pre-load rndkey[0]
        eor     v4.16b,v7.16b,v0.16b
        orr     v0.16b,v2.16b,v2.16b
        eor     v5.16b,v7.16b,v1.16b
        orr     v1.16b,v3.16b,v3.16b
        eor     v17.16b,v7.16b,v24.16b
        orr     v24.16b,v27.16b,v27.16b
        eor     v30.16b,v7.16b,v25.16b
        orr     v25.16b,v28.16b,v28.16b
        eor     v31.16b,v7.16b,v26.16b
        st1     {v4.16b},[x1],#16
        orr     v26.16b,v29.16b,v29.16b
        st1     {v5.16b},[x1],#16
        mov     w6,w5
        st1     {v17.16b},[x1],#16
        ld1     {v17.4s},[x7],#16                       // re-pre-load rndkey[1]
        st1     {v30.16b},[x1],#16
        st1     {v31.16b},[x1],#16
        b.hs    .Loop5x_ecb_dec

        add     x2,x2,#0x50
        cbz     x2,.Lecb_done

        add     w6,w5,#2
        subs    x2,x2,#0x30
        orr     v0.16b,v27.16b,v27.16b
        orr     v1.16b,v28.16b,v28.16b
        orr     v24.16b,v29.16b,v29.16b
        b.lo    .Lecb_dec_tail

        b       .Loop3x_ecb_dec

.align  4
.Lecb_tail4x:
        eor     v5.16b,v7.16b,v1.16b
        eor     v17.16b,v7.16b,v24.16b
        eor     v30.16b,v7.16b,v25.16b
        eor     v31.16b,v7.16b,v26.16b
        st1     {v5.16b},[x1],#16
        st1     {v17.16b},[x1],#16
        st1     {v30.16b},[x1],#16
        st1     {v31.16b},[x1],#16

        b       .Lecb_done
.align  4
.Loop3x_ecb_dec:
        aesd    v0.16b,v16.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v16.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v16.16b
        aesimc  v24.16b,v24.16b
        ld1     {v16.4s},[x7],#16
        subs    w6,w6,#2
        aesd    v0.16b,v17.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v17.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v17.16b
        aesimc  v24.16b,v24.16b
        ld1     {v17.4s},[x7],#16
        b.gt    .Loop3x_ecb_dec

        aesd    v0.16b,v16.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v16.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v16.16b
        aesimc  v24.16b,v24.16b
        subs    x2,x2,#0x30
        csel    x6,x2,x6,lo                             // x6, w6, is zero at this point
        aesd    v0.16b,v17.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v17.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v17.16b
        aesimc  v24.16b,v24.16b
        add     x0,x0,x6                        // x0 is adjusted in such way that
                                                // at exit from the loop v1.16b-v24.16b
                                                // are loaded with last "words"
        mov     x7,x3
        aesd    v0.16b,v20.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v20.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v20.16b
        aesimc  v24.16b,v24.16b
        ld1     {v2.16b},[x0],#16
        aesd    v0.16b,v21.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v21.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v21.16b
        aesimc  v24.16b,v24.16b
        ld1     {v3.16b},[x0],#16
        aesd    v0.16b,v22.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v22.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v22.16b
        aesimc  v24.16b,v24.16b
        ld1     {v27.16b},[x0],#16
        aesd    v0.16b,v23.16b
        aesd    v1.16b,v23.16b
        aesd    v24.16b,v23.16b
        ld1     {v16.4s},[x7],#16                       // re-pre-load rndkey[0]
        add     w6,w5,#2
        eor     v4.16b,v7.16b,v0.16b
        eor     v5.16b,v7.16b,v1.16b
        eor     v24.16b,v24.16b,v7.16b
        ld1     {v17.4s},[x7],#16                       // re-pre-load rndkey[1]
        st1     {v4.16b},[x1],#16
        orr     v0.16b,v2.16b,v2.16b
        st1     {v5.16b},[x1],#16
        orr     v1.16b,v3.16b,v3.16b
        st1     {v24.16b},[x1],#16
        orr     v24.16b,v27.16b,v27.16b
        b.hs    .Loop3x_ecb_dec

        cmn     x2,#0x30
        b.eq    .Lecb_done
        nop

.Lecb_dec_tail:
        aesd    v1.16b,v16.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v16.16b
        aesimc  v24.16b,v24.16b
        ld1     {v16.4s},[x7],#16
        subs    w6,w6,#2
        aesd    v1.16b,v17.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v17.16b
        aesimc  v24.16b,v24.16b
        ld1     {v17.4s},[x7],#16
        b.gt    .Lecb_dec_tail

        aesd    v1.16b,v16.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v16.16b
        aesimc  v24.16b,v24.16b
        aesd    v1.16b,v17.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v17.16b
        aesimc  v24.16b,v24.16b
        aesd    v1.16b,v20.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v20.16b
        aesimc  v24.16b,v24.16b
        cmn     x2,#0x20
        aesd    v1.16b,v21.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v21.16b
        aesimc  v24.16b,v24.16b
        aesd    v1.16b,v22.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v22.16b
        aesimc  v24.16b,v24.16b
        aesd    v1.16b,v23.16b
        aesd    v24.16b,v23.16b
        b.eq    .Lecb_dec_one
        eor     v5.16b,v7.16b,v1.16b
        eor     v17.16b,v7.16b,v24.16b
        st1     {v5.16b},[x1],#16
        st1     {v17.16b},[x1],#16
        b       .Lecb_done

.Lecb_dec_one:
        eor     v5.16b,v7.16b,v24.16b
        st1     {v5.16b},[x1],#16

.Lecb_done:
        ldr     x29,[sp],#16
.Lecb_Final_abort:
        ret
.size   aes_v8_ecb_encrypt,.-aes_v8_ecb_encrypt
.globl  aes_v8_cbc_encrypt
.type   aes_v8_cbc_encrypt,%function
.align  5
aes_v8_cbc_encrypt:
        AARCH64_VALID_CALL_TARGET
        // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
        stp     x29,x30,[sp,#-16]!
        add     x29,sp,#0
        subs    x2,x2,#16
        mov     x8,#16
        b.lo    .Lcbc_abort
        csel    x8,xzr,x8,eq

        cmp     w5,#0                   // en- or decrypting?
        ldr     w5,[x3,#240]
        and     x2,x2,#-16
        ld1     {v6.16b},[x4]
        ld1     {v0.16b},[x0],x8

        ld1     {v16.4s,v17.4s},[x3]            // load key schedule...
        sub     w5,w5,#6
        add     x7,x3,x5,lsl#4  // pointer to last 7 round keys
        sub     w5,w5,#2
        ld1     {v18.4s,v19.4s},[x7],#32
        ld1     {v20.4s,v21.4s},[x7],#32
        ld1     {v22.4s,v23.4s},[x7],#32
        ld1     {v7.4s},[x7]

        add     x7,x3,#32
        mov     w6,w5
        b.eq    .Lcbc_dec

        cmp     w5,#2
        eor     v0.16b,v0.16b,v6.16b
        eor     v5.16b,v16.16b,v7.16b
        b.eq    .Lcbc_enc128

        ld1     {v2.4s,v3.4s},[x7]
        add     x7,x3,#16
        add     x6,x3,#16*4
        add     x12,x3,#16*5
        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        add     x14,x3,#16*6
        add     x3,x3,#16*7
        b       .Lenter_cbc_enc

.align  4
.Loop_cbc_enc:
        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        st1     {v6.16b},[x1],#16
.Lenter_cbc_enc:
        aese    v0.16b,v17.16b
        aesmc   v0.16b,v0.16b
        aese    v0.16b,v2.16b
        aesmc   v0.16b,v0.16b
        ld1     {v16.4s},[x6]
        cmp     w5,#4
        aese    v0.16b,v3.16b
        aesmc   v0.16b,v0.16b
        ld1     {v17.4s},[x12]
        b.eq    .Lcbc_enc192

        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        ld1     {v16.4s},[x14]
        aese    v0.16b,v17.16b
        aesmc   v0.16b,v0.16b
        ld1     {v17.4s},[x3]
        nop

.Lcbc_enc192:
        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        subs    x2,x2,#16
        aese    v0.16b,v17.16b
        aesmc   v0.16b,v0.16b
        csel    x8,xzr,x8,eq
        aese    v0.16b,v18.16b
        aesmc   v0.16b,v0.16b
        aese    v0.16b,v19.16b
        aesmc   v0.16b,v0.16b
        ld1     {v16.16b},[x0],x8
        aese    v0.16b,v20.16b
        aesmc   v0.16b,v0.16b
        eor     v16.16b,v16.16b,v5.16b
        aese    v0.16b,v21.16b
        aesmc   v0.16b,v0.16b
        ld1     {v17.4s},[x7]           // re-pre-load rndkey[1]
        aese    v0.16b,v22.16b
        aesmc   v0.16b,v0.16b
        aese    v0.16b,v23.16b
        eor     v6.16b,v0.16b,v7.16b
        b.hs    .Loop_cbc_enc

        st1     {v6.16b},[x1],#16
        b       .Lcbc_done

.align  5
.Lcbc_enc128:
        ld1     {v2.4s,v3.4s},[x7]
        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        b       .Lenter_cbc_enc128
.Loop_cbc_enc128:
        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        st1     {v6.16b},[x1],#16
.Lenter_cbc_enc128:
        aese    v0.16b,v17.16b
        aesmc   v0.16b,v0.16b
        subs    x2,x2,#16
        aese    v0.16b,v2.16b
        aesmc   v0.16b,v0.16b
        csel    x8,xzr,x8,eq
        aese    v0.16b,v3.16b
        aesmc   v0.16b,v0.16b
        aese    v0.16b,v18.16b
        aesmc   v0.16b,v0.16b
        aese    v0.16b,v19.16b
        aesmc   v0.16b,v0.16b
        ld1     {v16.16b},[x0],x8
        aese    v0.16b,v20.16b
        aesmc   v0.16b,v0.16b
        aese    v0.16b,v21.16b
        aesmc   v0.16b,v0.16b
        aese    v0.16b,v22.16b
        aesmc   v0.16b,v0.16b
        eor     v16.16b,v16.16b,v5.16b
        aese    v0.16b,v23.16b
        eor     v6.16b,v0.16b,v7.16b
        b.hs    .Loop_cbc_enc128

        st1     {v6.16b},[x1],#16
        b       .Lcbc_done
.align  5
.Lcbc_dec:
        ld1     {v24.16b},[x0],#16
        subs    x2,x2,#32               // bias
        add     w6,w5,#2
        orr     v3.16b,v0.16b,v0.16b
        orr     v1.16b,v0.16b,v0.16b
        orr     v27.16b,v24.16b,v24.16b
        b.lo    .Lcbc_dec_tail

        orr     v1.16b,v24.16b,v24.16b
        ld1     {v24.16b},[x0],#16
        orr     v2.16b,v0.16b,v0.16b
        orr     v3.16b,v1.16b,v1.16b
        orr     v27.16b,v24.16b,v24.16b
        cmp     x2,#32
        b.lo    .Loop3x_cbc_dec

        ld1     {v25.16b},[x0],#16
        ld1     {v26.16b},[x0],#16
        sub     x2,x2,#32               // bias
        mov     w6,w5
        orr     v28.16b,v25.16b,v25.16b
        orr     v29.16b,v26.16b,v26.16b

.Loop5x_cbc_dec:
        aesd    v0.16b,v16.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v16.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v16.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v16.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v16.16b
        aesimc  v26.16b,v26.16b
        ld1     {v16.4s},[x7],#16
        subs    w6,w6,#2
        aesd    v0.16b,v17.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v17.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v17.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v17.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v17.16b
        aesimc  v26.16b,v26.16b
        ld1     {v17.4s},[x7],#16
        b.gt    .Loop5x_cbc_dec

        aesd    v0.16b,v16.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v16.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v16.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v16.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v16.16b
        aesimc  v26.16b,v26.16b
        cmp     x2,#0x40                // because .Lcbc_tail4x
        sub     x2,x2,#0x50

        aesd    v0.16b,v17.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v17.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v17.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v17.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v17.16b
        aesimc  v26.16b,v26.16b
        csel    x6,xzr,x2,gt            // borrow x6, w6, "gt" is not typo
        mov     x7,x3

        aesd    v0.16b,v18.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v18.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v18.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v18.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v18.16b
        aesimc  v26.16b,v26.16b
        add     x0,x0,x6                // x0 is adjusted in such way that
                                        // at exit from the loop v1.16b-v26.16b
                                        // are loaded with last "words"
        add     x6,x2,#0x60             // because .Lcbc_tail4x

        aesd    v0.16b,v19.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v19.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v19.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v19.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v19.16b
        aesimc  v26.16b,v26.16b

        aesd    v0.16b,v20.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v20.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v20.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v20.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v20.16b
        aesimc  v26.16b,v26.16b

        aesd    v0.16b,v21.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v21.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v21.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v21.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v21.16b
        aesimc  v26.16b,v26.16b

        aesd    v0.16b,v22.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v22.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v22.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v22.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v22.16b
        aesimc  v26.16b,v26.16b

        eor     v4.16b,v6.16b,v7.16b
        aesd    v0.16b,v23.16b
        eor     v5.16b,v2.16b,v7.16b
        ld1     {v2.16b},[x0],#16
        aesd    v1.16b,v23.16b
        eor     v17.16b,v3.16b,v7.16b
        ld1     {v3.16b},[x0],#16
        aesd    v24.16b,v23.16b
        eor     v30.16b,v27.16b,v7.16b
        ld1     {v27.16b},[x0],#16
        aesd    v25.16b,v23.16b
        eor     v31.16b,v28.16b,v7.16b
        ld1     {v28.16b},[x0],#16
        aesd    v26.16b,v23.16b
        orr     v6.16b,v29.16b,v29.16b
        ld1     {v29.16b},[x0],#16
        cbz     x6,.Lcbc_tail4x
        ld1     {v16.4s},[x7],#16       // re-pre-load rndkey[0]
        eor     v4.16b,v4.16b,v0.16b
        orr     v0.16b,v2.16b,v2.16b
        eor     v5.16b,v5.16b,v1.16b
        orr     v1.16b,v3.16b,v3.16b
        eor     v17.16b,v17.16b,v24.16b
        orr     v24.16b,v27.16b,v27.16b
        eor     v30.16b,v30.16b,v25.16b
        orr     v25.16b,v28.16b,v28.16b
        eor     v31.16b,v31.16b,v26.16b
        st1     {v4.16b},[x1],#16
        orr     v26.16b,v29.16b,v29.16b
        st1     {v5.16b},[x1],#16
        mov     w6,w5
        st1     {v17.16b},[x1],#16
        ld1     {v17.4s},[x7],#16       // re-pre-load rndkey[1]
        st1     {v30.16b},[x1],#16
        st1     {v31.16b},[x1],#16
        b.hs    .Loop5x_cbc_dec

        add     x2,x2,#0x50
        cbz     x2,.Lcbc_done

        add     w6,w5,#2
        subs    x2,x2,#0x30
        orr     v0.16b,v27.16b,v27.16b
        orr     v2.16b,v27.16b,v27.16b
        orr     v1.16b,v28.16b,v28.16b
        orr     v3.16b,v28.16b,v28.16b
        orr     v24.16b,v29.16b,v29.16b
        orr     v27.16b,v29.16b,v29.16b
        b.lo    .Lcbc_dec_tail

        b       .Loop3x_cbc_dec

.align  4
.Lcbc_tail4x:
        eor     v5.16b,v4.16b,v1.16b
        eor     v17.16b,v17.16b,v24.16b
        eor     v30.16b,v30.16b,v25.16b
        eor     v31.16b,v31.16b,v26.16b
        st1     {v5.16b},[x1],#16
        st1     {v17.16b},[x1],#16
        st1     {v30.16b},[x1],#16
        st1     {v31.16b},[x1],#16

        b       .Lcbc_done
.align  4
.Loop3x_cbc_dec:
        aesd    v0.16b,v16.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v16.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v16.16b
        aesimc  v24.16b,v24.16b
        ld1     {v16.4s},[x7],#16
        subs    w6,w6,#2
        aesd    v0.16b,v17.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v17.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v17.16b
        aesimc  v24.16b,v24.16b
        ld1     {v17.4s},[x7],#16
        b.gt    .Loop3x_cbc_dec

        aesd    v0.16b,v16.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v16.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v16.16b
        aesimc  v24.16b,v24.16b
        eor     v4.16b,v6.16b,v7.16b
        subs    x2,x2,#0x30
        eor     v5.16b,v2.16b,v7.16b
        csel    x6,x2,x6,lo                     // x6, w6, is zero at this point
        aesd    v0.16b,v17.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v17.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v17.16b
        aesimc  v24.16b,v24.16b
        eor     v17.16b,v3.16b,v7.16b
        add     x0,x0,x6                // x0 is adjusted in such way that
                                        // at exit from the loop v1.16b-v24.16b
                                        // are loaded with last "words"
        orr     v6.16b,v27.16b,v27.16b
        mov     x7,x3
        aesd    v0.16b,v20.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v20.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v20.16b
        aesimc  v24.16b,v24.16b
        ld1     {v2.16b},[x0],#16
        aesd    v0.16b,v21.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v21.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v21.16b
        aesimc  v24.16b,v24.16b
        ld1     {v3.16b},[x0],#16
        aesd    v0.16b,v22.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v22.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v22.16b
        aesimc  v24.16b,v24.16b
        ld1     {v27.16b},[x0],#16
        aesd    v0.16b,v23.16b
        aesd    v1.16b,v23.16b
        aesd    v24.16b,v23.16b
        ld1     {v16.4s},[x7],#16       // re-pre-load rndkey[0]
        add     w6,w5,#2
        eor     v4.16b,v4.16b,v0.16b
        eor     v5.16b,v5.16b,v1.16b
        eor     v24.16b,v24.16b,v17.16b
        ld1     {v17.4s},[x7],#16       // re-pre-load rndkey[1]
        st1     {v4.16b},[x1],#16
        orr     v0.16b,v2.16b,v2.16b
        st1     {v5.16b},[x1],#16
        orr     v1.16b,v3.16b,v3.16b
        st1     {v24.16b},[x1],#16
        orr     v24.16b,v27.16b,v27.16b
        b.hs    .Loop3x_cbc_dec

        cmn     x2,#0x30
        b.eq    .Lcbc_done
        nop

.Lcbc_dec_tail:
        aesd    v1.16b,v16.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v16.16b
        aesimc  v24.16b,v24.16b
        ld1     {v16.4s},[x7],#16
        subs    w6,w6,#2
        aesd    v1.16b,v17.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v17.16b
        aesimc  v24.16b,v24.16b
        ld1     {v17.4s},[x7],#16
        b.gt    .Lcbc_dec_tail

        aesd    v1.16b,v16.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v16.16b
        aesimc  v24.16b,v24.16b
        aesd    v1.16b,v17.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v17.16b
        aesimc  v24.16b,v24.16b
        aesd    v1.16b,v20.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v20.16b
        aesimc  v24.16b,v24.16b
        cmn     x2,#0x20
        aesd    v1.16b,v21.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v21.16b
        aesimc  v24.16b,v24.16b
        eor     v5.16b,v6.16b,v7.16b
        aesd    v1.16b,v22.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v22.16b
        aesimc  v24.16b,v24.16b
        eor     v17.16b,v3.16b,v7.16b
        aesd    v1.16b,v23.16b
        aesd    v24.16b,v23.16b
        b.eq    .Lcbc_dec_one
        eor     v5.16b,v5.16b,v1.16b
        eor     v17.16b,v17.16b,v24.16b
        orr     v6.16b,v27.16b,v27.16b
        st1     {v5.16b},[x1],#16
        st1     {v17.16b},[x1],#16
        b       .Lcbc_done

.Lcbc_dec_one:
        eor     v5.16b,v5.16b,v24.16b
        orr     v6.16b,v27.16b,v27.16b
        st1     {v5.16b},[x1],#16

.Lcbc_done:
        st1     {v6.16b},[x4]
.Lcbc_abort:
        ldr     x29,[sp],#16
        ret
.size   aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
.globl  aes_v8_ctr32_encrypt_blocks_unroll12_eor3
.type   aes_v8_ctr32_encrypt_blocks_unroll12_eor3,%function
.align  5
aes_v8_ctr32_encrypt_blocks_unroll12_eor3:
        AARCH64_VALID_CALL_TARGET
        // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
        stp     x29,x30,[sp,#-80]!
        stp     d8,d9,[sp, #16]
        stp     d10,d11,[sp, #32]
        stp     d12,d13,[sp, #48]
        stp     d14,d15,[sp, #64]
        add     x29,sp,#0

        ldr     w5,[x3,#240]

        ldr     w8, [x4, #12]
#ifdef __AARCH64EB__
        ld1     {v24.16b},[x4]
#else
        ld1     {v24.4s},[x4]
#endif
        ld1     {v2.4s,v3.4s},[x3]              // load key schedule...
        sub     w5,w5,#4
        cmp     x2,#2
        add     x7,x3,x5,lsl#4  // pointer to last round key
        sub     w5,w5,#2
        add     x7, x7, #64
        ld1     {v1.4s},[x7]
        add     x7,x3,#32
        mov     w6,w5
#ifndef __AARCH64EB__
        rev     w8, w8
#endif

        orr     v25.16b,v24.16b,v24.16b
        add     w10, w8, #1
        orr     v26.16b,v24.16b,v24.16b
        add     w8, w8, #2
        orr     v0.16b,v24.16b,v24.16b
        rev     w10, w10
        mov     v25.s[3],w10
        b.ls    .Lctr32_tail_unroll
        cmp     x2,#6
        rev     w12, w8
        sub     x2,x2,#3                // bias
        mov     v26.s[3],w12
        b.lo    .Loop3x_ctr32_unroll
        cmp     x2,#9
        orr     v27.16b,v24.16b,v24.16b
        add     w11, w8, #1
        orr     v28.16b,v24.16b,v24.16b
        add     w13, w8, #2
        rev     w11, w11
        orr     v29.16b,v24.16b,v24.16b
        add     w8, w8, #3
        rev     w13, w13
        mov     v27.s[3],w11
        rev     w14, w8
        mov     v28.s[3],w13
        mov     v29.s[3],w14
        sub     x2,x2,#3
        b.lo    .Loop6x_ctr32_unroll

        // push regs to stack when 12 data chunks are interleaved
        stp     x19,x20,[sp,#-16]!
        stp     x21,x22,[sp,#-16]!
        stp     x23,x24,[sp,#-16]!
        stp     d8,d9,[sp,#-32]!
        stp     d10,d11,[sp,#-32]!

        add     w15,w8,#1
        add     w19,w8,#2
        add     w20,w8,#3
        add     w21,w8,#4
        add     w22,w8,#5
        add     w8,w8,#6
        orr     v30.16b,v24.16b,v24.16b
        rev     w15,w15
        orr     v31.16b,v24.16b,v24.16b
        rev     w19,w19
        orr     v8.16b,v24.16b,v24.16b
        rev     w20,w20
        orr     v9.16b,v24.16b,v24.16b
        rev     w21,w21
        orr     v10.16b,v24.16b,v24.16b
        rev     w22,w22
        orr     v11.16b,v24.16b,v24.16b
        rev     w23,w8

        sub     x2,x2,#6                // bias
        mov     v30.s[3],w15
        mov     v31.s[3],w19
        mov     v8.s[3],w20
        mov     v9.s[3],w21
        mov     v10.s[3],w22
        mov     v11.s[3],w23
        b       .Loop12x_ctr32_unroll

.align  4
.Loop12x_ctr32_unroll:
        aese    v24.16b,v2.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v2.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v2.16b
        aesmc   v26.16b,v26.16b
        aese    v27.16b,v2.16b
        aesmc   v27.16b,v27.16b
        aese    v28.16b,v2.16b
        aesmc   v28.16b,v28.16b
        aese    v29.16b,v2.16b
        aesmc   v29.16b,v29.16b
        aese    v30.16b,v2.16b
        aesmc   v30.16b,v30.16b
        aese    v31.16b,v2.16b
        aesmc   v31.16b,v31.16b
        aese    v8.16b,v2.16b
        aesmc   v8.16b,v8.16b
        aese    v9.16b,v2.16b
        aesmc   v9.16b,v9.16b
        aese    v10.16b,v2.16b
        aesmc   v10.16b,v10.16b
        aese    v11.16b,v2.16b
        aesmc   v11.16b,v11.16b
        ld1     {v2.4s},[x7],#16
        subs    w6,w6,#2
        aese    v24.16b,v3.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v3.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v3.16b
        aesmc   v26.16b,v26.16b
        aese    v27.16b,v3.16b
        aesmc   v27.16b,v27.16b
        aese    v28.16b,v3.16b
        aesmc   v28.16b,v28.16b
        aese    v29.16b,v3.16b
        aesmc   v29.16b,v29.16b
        aese    v30.16b,v3.16b
        aesmc   v30.16b,v30.16b
        aese    v31.16b,v3.16b
        aesmc   v31.16b,v31.16b
        aese    v8.16b,v3.16b
        aesmc   v8.16b,v8.16b
        aese    v9.16b,v3.16b
        aesmc   v9.16b,v9.16b
        aese    v10.16b,v3.16b
        aesmc   v10.16b,v10.16b
        aese    v11.16b,v3.16b
        aesmc   v11.16b,v11.16b
        ld1     {v3.4s},[x7],#16
        b.gt    .Loop12x_ctr32_unroll

        aese    v24.16b,v2.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v2.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v2.16b
        aesmc   v26.16b,v26.16b
        aese    v27.16b,v2.16b
        aesmc   v27.16b,v27.16b
        aese    v28.16b,v2.16b
        aesmc   v28.16b,v28.16b
        aese    v29.16b,v2.16b
        aesmc   v29.16b,v29.16b
        aese    v30.16b,v2.16b
        aesmc   v30.16b,v30.16b
        aese    v31.16b,v2.16b
        aesmc   v31.16b,v31.16b
        aese    v8.16b,v2.16b
        aesmc   v8.16b,v8.16b
        aese    v9.16b,v2.16b
        aesmc   v9.16b,v9.16b
        aese    v10.16b,v2.16b
        aesmc   v10.16b,v10.16b
        aese    v11.16b,v2.16b
        aesmc   v11.16b,v11.16b
        ld1     {v2.4s},[x7],#16

        aese    v24.16b,v3.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v3.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v3.16b
        aesmc   v26.16b,v26.16b
        aese    v27.16b,v3.16b
        aesmc   v27.16b,v27.16b
        aese    v28.16b,v3.16b
        aesmc   v28.16b,v28.16b
        aese    v29.16b,v3.16b
        aesmc   v29.16b,v29.16b
        aese    v30.16b,v3.16b
        aesmc   v30.16b,v30.16b
        aese    v31.16b,v3.16b
        aesmc   v31.16b,v31.16b
        aese    v8.16b,v3.16b
        aesmc   v8.16b,v8.16b
        aese    v9.16b,v3.16b
        aesmc   v9.16b,v9.16b
        aese    v10.16b,v3.16b
        aesmc   v10.16b,v10.16b
        aese    v11.16b,v3.16b
        aesmc   v11.16b,v11.16b
        ld1     {v3.4s},[x7],#16

        aese    v24.16b,v2.16b
        aesmc   v24.16b,v24.16b
        add     w9,w8,#1
        add     w10,w8,#2
        aese    v25.16b,v2.16b
        aesmc   v25.16b,v25.16b
        add     w12,w8,#3
        add     w11,w8,#4
        aese    v26.16b,v2.16b
        aesmc   v26.16b,v26.16b
        add     w13,w8,#5
        add     w14,w8,#6
        rev     w9,w9
        aese    v27.16b,v2.16b
        aesmc   v27.16b,v27.16b
        add     w15,w8,#7
        add     w19,w8,#8
        rev     w10,w10
        rev     w12,w12
        aese    v28.16b,v2.16b
        aesmc   v28.16b,v28.16b
        add     w20,w8,#9
        add     w21,w8,#10
        rev     w11,w11
        rev     w13,w13
        aese    v29.16b,v2.16b
        aesmc   v29.16b,v29.16b
        add     w22,w8,#11
        add     w23,w8,#12
        rev     w14,w14
        rev     w15,w15
        aese    v30.16b,v2.16b
        aesmc   v30.16b,v30.16b
        rev     w19,w19
        rev     w20,w20
        aese    v31.16b,v2.16b
        aesmc   v31.16b,v31.16b
        rev     w21,w21
        rev     w22,w22
        aese    v8.16b,v2.16b
        aesmc   v8.16b,v8.16b
        rev     w23,w23
        aese    v9.16b,v2.16b
        aesmc   v9.16b,v9.16b
        aese    v10.16b,v2.16b
        aesmc   v10.16b,v10.16b
        aese    v11.16b,v2.16b
        aesmc   v11.16b,v11.16b
        ld1     {v2.4s},[x7],#16

        aese    v24.16b,v3.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v3.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v3.16b
        aesmc   v26.16b,v26.16b
        aese    v27.16b,v3.16b
        aesmc   v27.16b,v27.16b
        ld1     {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
        aese    v28.16b,v3.16b
        aesmc   v28.16b,v28.16b
        aese    v29.16b,v3.16b
        aesmc   v29.16b,v29.16b
        aese    v30.16b,v3.16b
        aesmc   v30.16b,v30.16b
        aese    v31.16b,v3.16b
        aesmc   v31.16b,v31.16b
        ld1     {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
        aese    v8.16b,v3.16b
        aesmc   v8.16b,v8.16b
        aese    v9.16b,v3.16b
        aesmc   v9.16b,v9.16b
        aese    v10.16b,v3.16b
        aesmc   v10.16b,v10.16b
        aese    v11.16b,v3.16b
        aesmc   v11.16b,v11.16b
        ld1     {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
        ld1     {v3.4s},[x7],#16

        mov     x7, x3
        aese    v24.16b,v2.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v2.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v2.16b
        aesmc   v26.16b,v26.16b
        aese    v27.16b,v2.16b
        aesmc   v27.16b,v27.16b
        aese    v28.16b,v2.16b
        aesmc   v28.16b,v28.16b
        aese    v29.16b,v2.16b
        aesmc   v29.16b,v29.16b
        aese    v30.16b,v2.16b
        aesmc   v30.16b,v30.16b
        aese    v31.16b,v2.16b
        aesmc   v31.16b,v31.16b
        aese    v8.16b,v2.16b
        aesmc   v8.16b,v8.16b
        aese    v9.16b,v2.16b
        aesmc   v9.16b,v9.16b
        aese    v10.16b,v2.16b
        aesmc   v10.16b,v10.16b
        aese    v11.16b,v2.16b
        aesmc   v11.16b,v11.16b
        ld1     {v2.4s},[x7],#16        // re-pre-load rndkey[0]

        aese    v24.16b,v3.16b
.inst   0xce016084      //eor3 v4.16b,v4.16b,v1.16b,v24.16b
        orr     v24.16b,v0.16b,v0.16b
        aese    v25.16b,v3.16b
.inst   0xce0164a5      //eor3 v5.16b,v5.16b,v1.16b,v25.16b
        orr     v25.16b,v0.16b,v0.16b
        aese    v26.16b,v3.16b
.inst   0xce0168c6      //eor3 v6.16b,v6.16b,v1.16b,v26.16b
        orr     v26.16b,v0.16b,v0.16b
        aese    v27.16b,v3.16b
.inst   0xce016ce7      //eor3 v7.16b,v7.16b,v1.16b,v27.16b
        orr     v27.16b,v0.16b,v0.16b
        aese    v28.16b,v3.16b
.inst   0xce017210      //eor3 v16.16b,v16.16b,v1.16b,v28.16b
        orr     v28.16b,v0.16b,v0.16b
        aese    v29.16b,v3.16b
.inst   0xce017631      //eor3 v17.16b,v17.16b,v1.16b,v29.16b
        orr     v29.16b,v0.16b,v0.16b
        aese    v30.16b,v3.16b
.inst   0xce017a52      //eor3 v18.16b,v18.16b,v1.16b,v30.16b
        orr     v30.16b,v0.16b,v0.16b
        aese    v31.16b,v3.16b
.inst   0xce017e73      //eor3 v19.16b,v19.16b,v1.16b,v31.16b
        orr     v31.16b,v0.16b,v0.16b
        aese    v8.16b,v3.16b
.inst   0xce012294      //eor3 v20.16b,v20.16b,v1.16b,v8.16b
        orr     v8.16b,v0.16b,v0.16b
        aese    v9.16b,v3.16b
.inst   0xce0126b5      //eor3 v21.16b,v21.16b,v1.16b,v9.16b
        orr     v9.16b,v0.16b,v0.16b
        aese    v10.16b,v3.16b
.inst   0xce012ad6      //eor3 v22.16b,v22.16b,v1.16b,v10.16b
        orr     v10.16b,v0.16b,v0.16b
        aese    v11.16b,v3.16b
.inst   0xce012ef7      //eor3 v23.16b,v23.16b,v1.16b,v11.16b
        orr     v11.16b,v0.16b,v0.16b
        ld1     {v3.4s},[x7],#16        // re-pre-load rndkey[1]

        mov     v24.s[3],w9
        mov     v25.s[3],w10
        mov     v26.s[3],w12
        mov     v27.s[3],w11
        st1     {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
        mov     v28.s[3],w13
        mov     v29.s[3],w14
        mov     v30.s[3],w15
        mov     v31.s[3],w19
        st1     {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64
        mov     v8.s[3],w20
        mov     v9.s[3],w21
        mov     v10.s[3],w22
        mov     v11.s[3],w23
        st1     {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

        mov     w6,w5

        add     w8,w8,#12
        subs    x2,x2,#12
        b.hs    .Loop12x_ctr32_unroll

        // pop regs from stack when 12 data chunks are interleaved
        ldp     d10,d11,[sp],#32
        ldp     d8,d9,[sp],#32
        ldp     x23,x24,[sp],#16
        ldp     x21,x22,[sp],#16
        ldp     x19,x20,[sp],#16

        add     x2,x2,#12
        cbz     x2,.Lctr32_done_unroll
        sub     w8,w8,#12

        cmp     x2,#2
        b.ls    .Lctr32_tail_unroll

        cmp     x2,#6
        sub     x2,x2,#3                // bias
        add     w8,w8,#3
        b.lo    .Loop3x_ctr32_unroll

        sub     x2,x2,#3
        add     w8,w8,#3
        b.lo    .Loop6x_ctr32_unroll

.align  4
.Loop6x_ctr32_unroll:
        aese    v24.16b,v2.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v2.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v2.16b
        aesmc   v26.16b,v26.16b
        aese    v27.16b,v2.16b
        aesmc   v27.16b,v27.16b
        aese    v28.16b,v2.16b
        aesmc   v28.16b,v28.16b
        aese    v29.16b,v2.16b
        aesmc   v29.16b,v29.16b
        ld1     {v2.4s},[x7],#16
        subs    w6,w6,#2
        aese    v24.16b,v3.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v3.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v3.16b
        aesmc   v26.16b,v26.16b
        aese    v27.16b,v3.16b
        aesmc   v27.16b,v27.16b
        aese    v28.16b,v3.16b
        aesmc   v28.16b,v28.16b
        aese    v29.16b,v3.16b
        aesmc   v29.16b,v29.16b
        ld1     {v3.4s},[x7],#16
        b.gt    .Loop6x_ctr32_unroll

        aese    v24.16b,v2.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v2.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v2.16b
        aesmc   v26.16b,v26.16b
        aese    v27.16b,v2.16b
        aesmc   v27.16b,v27.16b
        aese    v28.16b,v2.16b
        aesmc   v28.16b,v28.16b
        aese    v29.16b,v2.16b
        aesmc   v29.16b,v29.16b
        ld1     {v2.4s},[x7],#16

        aese    v24.16b,v3.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v3.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v3.16b
        aesmc   v26.16b,v26.16b
        aese    v27.16b,v3.16b
        aesmc   v27.16b,v27.16b
        aese    v28.16b,v3.16b
        aesmc   v28.16b,v28.16b
        aese    v29.16b,v3.16b
        aesmc   v29.16b,v29.16b
        ld1     {v3.4s},[x7],#16

        aese    v24.16b,v2.16b
        aesmc   v24.16b,v24.16b
        add     w9,w8,#1
        add     w10,w8,#2
        aese    v25.16b,v2.16b
        aesmc   v25.16b,v25.16b
        add     w12,w8,#3
        add     w11,w8,#4
        aese    v26.16b,v2.16b
        aesmc   v26.16b,v26.16b
        add     w13,w8,#5
        add     w14,w8,#6
        rev     w9,w9
        aese    v27.16b,v2.16b
        aesmc   v27.16b,v27.16b
        rev     w10,w10
        rev     w12,w12
        aese    v28.16b,v2.16b
        aesmc   v28.16b,v28.16b
        rev     w11,w11
        rev     w13,w13
        aese    v29.16b,v2.16b
        aesmc   v29.16b,v29.16b
        rev     w14,w14
        ld1     {v2.4s},[x7],#16

        aese    v24.16b,v3.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v3.16b
        aesmc   v25.16b,v25.16b
        ld1     {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
        aese    v26.16b,v3.16b
        aesmc   v26.16b,v26.16b
        aese    v27.16b,v3.16b
        aesmc   v27.16b,v27.16b
        ld1     {v16.16b,v17.16b},[x0],#32
        aese    v28.16b,v3.16b
        aesmc   v28.16b,v28.16b
        aese    v29.16b,v3.16b
        aesmc   v29.16b,v29.16b
        ld1     {v3.4s},[x7],#16

        mov     x7, x3
        aese    v24.16b,v2.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v2.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v2.16b
        aesmc   v26.16b,v26.16b
        aese    v27.16b,v2.16b
        aesmc   v27.16b,v27.16b
        aese    v28.16b,v2.16b
        aesmc   v28.16b,v28.16b
        aese    v29.16b,v2.16b
        aesmc   v29.16b,v29.16b
        ld1     {v2.4s},[x7],#16        // re-pre-load rndkey[0]

        aese    v24.16b,v3.16b
.inst   0xce016084      //eor3 v4.16b,v4.16b,v1.16b,v24.16b
        aese    v25.16b,v3.16b
.inst   0xce0164a5      //eor3 v5.16b,v5.16b,v1.16b,v25.16b
        aese    v26.16b,v3.16b
.inst   0xce0168c6      //eor3 v6.16b,v6.16b,v1.16b,v26.16b
        aese    v27.16b,v3.16b
.inst   0xce016ce7      //eor3 v7.16b,v7.16b,v1.16b,v27.16b
        aese    v28.16b,v3.16b
.inst   0xce017210      //eor3 v16.16b,v16.16b,v1.16b,v28.16b
        aese    v29.16b,v3.16b
.inst   0xce017631      //eor3 v17.16b,v17.16b,v1.16b,v29.16b
        ld1     {v3.4s},[x7],#16        // re-pre-load rndkey[1]

        orr     v24.16b,v0.16b,v0.16b
        orr     v25.16b,v0.16b,v0.16b
        orr     v26.16b,v0.16b,v0.16b
        orr     v27.16b,v0.16b,v0.16b
        orr     v28.16b,v0.16b,v0.16b
        orr     v29.16b,v0.16b,v0.16b

        mov     v24.s[3],w9
        mov     v25.s[3],w10
        st1     {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
        mov     v26.s[3],w12
        mov     v27.s[3],w11
        st1     {v16.16b,v17.16b},[x1],#32
        mov     v28.s[3],w13
        mov     v29.s[3],w14

        cbz     x2,.Lctr32_done_unroll
        mov     w6,w5

        cmp     x2,#2
        b.ls    .Lctr32_tail_unroll

        sub     x2,x2,#3                // bias
        add     w8,w8,#3
        b       .Loop3x_ctr32_unroll

.align  4
.Loop3x_ctr32_unroll:
        aese    v24.16b,v2.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v2.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v2.16b
        aesmc   v26.16b,v26.16b
        ld1     {v2.4s},[x7],#16
        subs    w6,w6,#2
        aese    v24.16b,v3.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v3.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v3.16b
        aesmc   v26.16b,v26.16b
        ld1     {v3.4s},[x7],#16
        b.gt    .Loop3x_ctr32_unroll

        aese    v24.16b,v2.16b
        aesmc   v9.16b,v24.16b
        aese    v25.16b,v2.16b
        aesmc   v10.16b,v25.16b
        ld1     {v4.16b,v5.16b,v6.16b},[x0],#48
        orr     v24.16b,v0.16b,v0.16b
        aese    v26.16b,v2.16b
        aesmc   v26.16b,v26.16b
        ld1     {v2.4s},[x7],#16
        orr     v25.16b,v0.16b,v0.16b
        aese    v9.16b,v3.16b
        aesmc   v9.16b,v9.16b
        aese    v10.16b,v3.16b
        aesmc   v10.16b,v10.16b
        aese    v26.16b,v3.16b
        aesmc   v11.16b,v26.16b
        ld1     {v3.4s},[x7],#16
        orr     v26.16b,v0.16b,v0.16b
        add     w9,w8,#1
        aese    v9.16b,v2.16b
        aesmc   v9.16b,v9.16b
        aese    v10.16b,v2.16b
        aesmc   v10.16b,v10.16b
        add     w10,w8,#2
        aese    v11.16b,v2.16b
        aesmc   v11.16b,v11.16b
        ld1     {v2.4s},[x7],#16
        add     w8,w8,#3
        aese    v9.16b,v3.16b
        aesmc   v9.16b,v9.16b
        aese    v10.16b,v3.16b
        aesmc   v10.16b,v10.16b

        rev     w9,w9
        aese    v11.16b,v3.16b
        aesmc   v11.16b,v11.16b
        ld1     {v3.4s},[x7],#16
        mov     v24.s[3], w9
        mov     x7,x3
        rev     w10,w10
        aese    v9.16b,v2.16b
        aesmc   v9.16b,v9.16b

        aese    v10.16b,v2.16b
        aesmc   v10.16b,v10.16b
        mov     v25.s[3], w10
        rev     w12,w8
        aese    v11.16b,v2.16b
        aesmc   v11.16b,v11.16b
        mov     v26.s[3], w12

        aese    v9.16b,v3.16b
        aese    v10.16b,v3.16b
        aese    v11.16b,v3.16b

.inst   0xce012484      //eor3 v4.16b,v4.16b,v1.16b,v9.16b
        ld1     {v2.4s},[x7],#16        // re-pre-load rndkey[0]
.inst   0xce0128a5      //eor3 v5.16b,v5.16b,v1.16b,v10.16b
        mov     w6,w5
.inst   0xce012cc6      //eor3 v6.16b,v6.16b,v1.16b,v11.16b
        ld1     {v3.4s},[x7],#16        // re-pre-load rndkey[1]
        st1     {v4.16b,v5.16b,v6.16b},[x1],#48

        cbz     x2,.Lctr32_done_unroll

.Lctr32_tail_unroll:
        cmp     x2,#1
        b.eq    .Lctr32_tail_1_unroll

.Lctr32_tail_2_unroll:
        aese    v24.16b,v2.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v2.16b
        aesmc   v25.16b,v25.16b
        ld1     {v2.4s},[x7],#16
        subs    w6,w6,#2
        aese    v24.16b,v3.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v3.16b
        aesmc   v25.16b,v25.16b
        ld1     {v3.4s},[x7],#16
        b.gt    .Lctr32_tail_2_unroll

        aese    v24.16b,v2.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v2.16b
        aesmc   v25.16b,v25.16b
        ld1     {v2.4s},[x7],#16
        aese    v24.16b,v3.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v3.16b
        aesmc   v25.16b,v25.16b
        ld1     {v3.4s},[x7],#16
        ld1     {v4.16b,v5.16b},[x0],#32
        aese    v24.16b,v2.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v2.16b
        aesmc   v25.16b,v25.16b
        ld1     {v2.4s},[x7],#16
        aese    v24.16b,v3.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v3.16b
        aesmc   v25.16b,v25.16b
        ld1     {v3.4s},[x7],#16
        aese    v24.16b,v2.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v2.16b
        aesmc   v25.16b,v25.16b
        aese    v24.16b,v3.16b
        aese    v25.16b,v3.16b

.inst   0xce016084      //eor3 v4.16b,v4.16b,v1.16b,v24.16b
.inst   0xce0164a5      //eor3 v5.16b,v5.16b,v1.16b,v25.16b
        st1     {v4.16b,v5.16b},[x1],#32
        b       .Lctr32_done_unroll

.Lctr32_tail_1_unroll:
        aese    v24.16b,v2.16b
        aesmc   v24.16b,v24.16b
        ld1     {v2.4s},[x7],#16
        subs    w6,w6,#2
        aese    v24.16b,v3.16b
        aesmc   v24.16b,v24.16b
        ld1     {v3.4s},[x7],#16
        b.gt    .Lctr32_tail_1_unroll

        aese    v24.16b,v2.16b
        aesmc   v24.16b,v24.16b
        ld1     {v2.4s},[x7],#16
        aese    v24.16b,v3.16b
        aesmc   v24.16b,v24.16b
        ld1     {v3.4s},[x7],#16
        ld1     {v4.16b},[x0]
        aese    v24.16b,v2.16b
        aesmc   v24.16b,v24.16b
        ld1     {v2.4s},[x7],#16
        aese    v24.16b,v3.16b
        aesmc   v24.16b,v24.16b
        ld1     {v3.4s},[x7],#16
        aese    v24.16b,v2.16b
        aesmc   v24.16b,v24.16b
        aese    v24.16b,v3.16b

.inst   0xce016084      //eor3 v4.16b,v4.16b,v1.16b,v24.16b
        st1     {v4.16b},[x1],#16

.Lctr32_done_unroll:
        ldp     d8,d9,[sp, #16]
        ldp     d10,d11,[sp, #32]
        ldp     d12,d13,[sp, #48]
        ldp     d14,d15,[sp, #64]
        ldr     x29,[sp],#80
        ret
.size   aes_v8_ctr32_encrypt_blocks_unroll12_eor3,.-aes_v8_ctr32_encrypt_blocks_unroll12_eor3
.globl  aes_v8_ctr32_encrypt_blocks
.type   aes_v8_ctr32_encrypt_blocks,%function
.align  5
aes_v8_ctr32_encrypt_blocks:
        AARCH64_VALID_CALL_TARGET
        // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
        stp     x29,x30,[sp,#-16]!
        add     x29,sp,#0
        ldr     w5,[x3,#240]

        ldr     w8, [x4, #12]
#ifdef __AARCH64EB__
        ld1     {v0.16b},[x4]
#else
        ld1     {v0.4s},[x4]
#endif
        ld1     {v16.4s,v17.4s},[x3]            // load key schedule...
        sub     w5,w5,#4
        mov     x12,#16
        cmp     x2,#2
        add     x7,x3,x5,lsl#4  // pointer to last 5 round keys
        sub     w5,w5,#2
        ld1     {v20.4s,v21.4s},[x7],#32
        ld1     {v22.4s,v23.4s},[x7],#32
        ld1     {v7.4s},[x7]
        add     x7,x3,#32
        mov     w6,w5
        csel    x12,xzr,x12,lo
#ifndef __AARCH64EB__
        rev     w8, w8
#endif
        orr     v1.16b,v0.16b,v0.16b
        add     w10, w8, #1
        orr     v18.16b,v0.16b,v0.16b
        add     w8, w8, #2
        orr     v6.16b,v0.16b,v0.16b
        rev     w10, w10
        mov     v1.s[3],w10
        b.ls    .Lctr32_tail
        rev     w12, w8
        sub     x2,x2,#3                // bias
        mov     v18.s[3],w12
        cmp     x2,#32
        b.lo    .Loop3x_ctr32

        add     w13,w8,#1
        add     w14,w8,#2
        orr     v24.16b,v0.16b,v0.16b
        rev     w13,w13
        orr     v25.16b,v0.16b,v0.16b
        rev     w14,w14
        mov     v24.s[3],w13
        sub     x2,x2,#2                // bias
        mov     v25.s[3],w14
        add     w8,w8,#2
        b       .Loop5x_ctr32

.align  4
.Loop5x_ctr32:
        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v16.16b
        aesmc   v1.16b,v1.16b
        aese    v18.16b,v16.16b
        aesmc   v18.16b,v18.16b
        aese    v24.16b,v16.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v16.16b
        aesmc   v25.16b,v25.16b
        ld1     {v16.4s},[x7],#16
        subs    w6,w6,#2
        aese    v0.16b,v17.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v17.16b
        aesmc   v1.16b,v1.16b
        aese    v18.16b,v17.16b
        aesmc   v18.16b,v18.16b
        aese    v24.16b,v17.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v17.16b
        aesmc   v25.16b,v25.16b
        ld1     {v17.4s},[x7],#16
        b.gt    .Loop5x_ctr32

        mov     x7,x3
        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v16.16b
        aesmc   v1.16b,v1.16b
        aese    v18.16b,v16.16b
        aesmc   v18.16b,v18.16b
        aese    v24.16b,v16.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v16.16b
        aesmc   v25.16b,v25.16b
        ld1     {v16.4s},[x7],#16       // re-pre-load rndkey[0]

        aese    v0.16b,v17.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v17.16b
        aesmc   v1.16b,v1.16b
        aese    v18.16b,v17.16b
        aesmc   v18.16b,v18.16b
        aese    v24.16b,v17.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v17.16b
        aesmc   v25.16b,v25.16b
        ld1     {v17.4s},[x7],#16       // re-pre-load rndkey[1]

        aese    v0.16b,v20.16b
        aesmc   v0.16b,v0.16b
        add     w9,w8,#1
        add     w10,w8,#2
        aese    v1.16b,v20.16b
        aesmc   v1.16b,v1.16b
        add     w12,w8,#3
        add     w13,w8,#4
        aese    v18.16b,v20.16b
        aesmc   v18.16b,v18.16b
        add     w14,w8,#5
        rev     w9,w9
        aese    v24.16b,v20.16b
        aesmc   v24.16b,v24.16b
        rev     w10,w10
        rev     w12,w12
        aese    v25.16b,v20.16b
        aesmc   v25.16b,v25.16b
        rev     w13,w13
        rev     w14,w14

        aese    v0.16b,v21.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v21.16b
        aesmc   v1.16b,v1.16b
        aese    v18.16b,v21.16b
        aesmc   v18.16b,v18.16b
        aese    v24.16b,v21.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v21.16b
        aesmc   v25.16b,v25.16b

        aese    v0.16b,v22.16b
        aesmc   v0.16b,v0.16b
        ld1     {v2.16b},[x0],#16
        aese    v1.16b,v22.16b
        aesmc   v1.16b,v1.16b
        ld1     {v3.16b},[x0],#16
        aese    v18.16b,v22.16b
        aesmc   v18.16b,v18.16b
        ld1     {v19.16b},[x0],#16
        aese    v24.16b,v22.16b
        aesmc   v24.16b,v24.16b
        ld1     {v26.16b},[x0],#16
        aese    v25.16b,v22.16b
        aesmc   v25.16b,v25.16b
        ld1     {v27.16b},[x0],#16

        aese    v0.16b,v23.16b
        eor     v2.16b,v2.16b,v7.16b
        aese    v1.16b,v23.16b
        eor     v3.16b,v3.16b,v7.16b
        aese    v18.16b,v23.16b
        eor     v19.16b,v19.16b,v7.16b
        aese    v24.16b,v23.16b
        eor     v26.16b,v26.16b,v7.16b
        aese    v25.16b,v23.16b
        eor     v27.16b,v27.16b,v7.16b

        eor     v2.16b,v2.16b,v0.16b
        orr     v0.16b,v6.16b,v6.16b
        eor     v3.16b,v3.16b,v1.16b
        orr     v1.16b,v6.16b,v6.16b
        eor     v19.16b,v19.16b,v18.16b
        orr     v18.16b,v6.16b,v6.16b
        eor     v26.16b,v26.16b,v24.16b
        orr     v24.16b,v6.16b,v6.16b
        eor     v27.16b,v27.16b,v25.16b
        orr     v25.16b,v6.16b,v6.16b

        st1     {v2.16b},[x1],#16
        mov     v0.s[3],w9
        st1     {v3.16b},[x1],#16
        mov     v1.s[3],w10
        st1     {v19.16b},[x1],#16
        mov     v18.s[3],w12
        st1     {v26.16b},[x1],#16
        mov     v24.s[3],w13
        st1     {v27.16b},[x1],#16
        mov     v25.s[3],w14

        mov     w6,w5
        cbz     x2,.Lctr32_done

        add     w8,w8,#5
        subs    x2,x2,#5
        b.hs    .Loop5x_ctr32

        add     x2,x2,#5
        sub     w8,w8,#5

        cmp     x2,#2
        mov     x12,#16
        csel    x12,xzr,x12,lo
        b.ls    .Lctr32_tail

        sub     x2,x2,#3                // bias
        add     w8,w8,#3
        b       .Loop3x_ctr32

.align  4
.Loop3x_ctr32:
        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v16.16b
        aesmc   v1.16b,v1.16b
        aese    v18.16b,v16.16b
        aesmc   v18.16b,v18.16b
        ld1     {v16.4s},[x7],#16
        subs    w6,w6,#2
        aese    v0.16b,v17.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v17.16b
        aesmc   v1.16b,v1.16b
        aese    v18.16b,v17.16b
        aesmc   v18.16b,v18.16b
        ld1     {v17.4s},[x7],#16
        b.gt    .Loop3x_ctr32

        aese    v0.16b,v16.16b
        aesmc   v4.16b,v0.16b
        aese    v1.16b,v16.16b
        aesmc   v5.16b,v1.16b
        ld1     {v2.16b},[x0],#16
        orr     v0.16b,v6.16b,v6.16b
        aese    v18.16b,v16.16b
        aesmc   v18.16b,v18.16b
        ld1     {v3.16b},[x0],#16
        orr     v1.16b,v6.16b,v6.16b
        aese    v4.16b,v17.16b
        aesmc   v4.16b,v4.16b
        aese    v5.16b,v17.16b
        aesmc   v5.16b,v5.16b
        ld1     {v19.16b},[x0],#16
        mov     x7,x3
        aese    v18.16b,v17.16b
        aesmc   v17.16b,v18.16b
        orr     v18.16b,v6.16b,v6.16b
        add     w9,w8,#1
        aese    v4.16b,v20.16b
        aesmc   v4.16b,v4.16b
        aese    v5.16b,v20.16b
        aesmc   v5.16b,v5.16b
        eor     v2.16b,v2.16b,v7.16b
        add     w10,w8,#2
        aese    v17.16b,v20.16b
        aesmc   v17.16b,v17.16b
        eor     v3.16b,v3.16b,v7.16b
        add     w8,w8,#3
        aese    v4.16b,v21.16b
        aesmc   v4.16b,v4.16b
        aese    v5.16b,v21.16b
        aesmc   v5.16b,v5.16b
        eor     v19.16b,v19.16b,v7.16b
        rev     w9,w9
        aese    v17.16b,v21.16b
        aesmc   v17.16b,v17.16b
        mov     v0.s[3], w9
        rev     w10,w10
        aese    v4.16b,v22.16b
        aesmc   v4.16b,v4.16b
        aese    v5.16b,v22.16b
        aesmc   v5.16b,v5.16b
        mov     v1.s[3], w10
        rev     w12,w8
        aese    v17.16b,v22.16b
        aesmc   v17.16b,v17.16b
        mov     v18.s[3], w12
        subs    x2,x2,#3
        aese    v4.16b,v23.16b
        aese    v5.16b,v23.16b
        aese    v17.16b,v23.16b

        eor     v2.16b,v2.16b,v4.16b
        ld1     {v16.4s},[x7],#16       // re-pre-load rndkey[0]
        st1     {v2.16b},[x1],#16
        eor     v3.16b,v3.16b,v5.16b
        mov     w6,w5
        st1     {v3.16b},[x1],#16
        eor     v19.16b,v19.16b,v17.16b
        ld1     {v17.4s},[x7],#16       // re-pre-load rndkey[1]
        st1     {v19.16b},[x1],#16
        b.hs    .Loop3x_ctr32

        adds    x2,x2,#3
        b.eq    .Lctr32_done
        cmp     x2,#1
        mov     x12,#16
        csel    x12,xzr,x12,eq

.Lctr32_tail:
        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v16.16b
        aesmc   v1.16b,v1.16b
        ld1     {v16.4s},[x7],#16
        subs    w6,w6,#2
        aese    v0.16b,v17.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v17.16b
        aesmc   v1.16b,v1.16b
        ld1     {v17.4s},[x7],#16
        b.gt    .Lctr32_tail

        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v16.16b
        aesmc   v1.16b,v1.16b
        aese    v0.16b,v17.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v17.16b
        aesmc   v1.16b,v1.16b
        ld1     {v2.16b},[x0],x12
        aese    v0.16b,v20.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v20.16b
        aesmc   v1.16b,v1.16b
        ld1     {v3.16b},[x0]
        aese    v0.16b,v21.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v21.16b
        aesmc   v1.16b,v1.16b
        eor     v2.16b,v2.16b,v7.16b
        aese    v0.16b,v22.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v22.16b
        aesmc   v1.16b,v1.16b
        eor     v3.16b,v3.16b,v7.16b
        aese    v0.16b,v23.16b
        aese    v1.16b,v23.16b

        cmp     x2,#1
        eor     v2.16b,v2.16b,v0.16b
        eor     v3.16b,v3.16b,v1.16b
        st1     {v2.16b},[x1],#16
        b.eq    .Lctr32_done
        st1     {v3.16b},[x1]

.Lctr32_done:
        ldr     x29,[sp],#16
        ret
.size   aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
.globl  aes_v8_xts_encrypt
.type   aes_v8_xts_encrypt,%function
.align  5
aes_v8_xts_encrypt:
        AARCH64_VALID_CALL_TARGET
        cmp     x2,#16
        // Original input data size bigger than 16, jump to big size processing.
        b.ne    .Lxts_enc_big_size
        // Encrypt the iv with key2, as the first XEX iv.
        ldr     w6,[x4,#240]
        ld1     {v0.4s},[x4],#16
        ld1     {v6.16b},[x5]
        sub     w6,w6,#2
        ld1     {v1.4s},[x4],#16

.Loop_enc_iv_enc:
        aese    v6.16b,v0.16b
        aesmc   v6.16b,v6.16b
        ld1     {v0.4s},[x4],#16
        subs    w6,w6,#2
        aese    v6.16b,v1.16b
        aesmc   v6.16b,v6.16b
        ld1     {v1.4s},[x4],#16
        b.gt    .Loop_enc_iv_enc

        aese    v6.16b,v0.16b
        aesmc   v6.16b,v6.16b
        ld1     {v0.4s},[x4]
        aese    v6.16b,v1.16b
        eor     v6.16b,v6.16b,v0.16b

        ld1     {v0.16b},[x0]
        eor     v0.16b,v6.16b,v0.16b

        ldr     w6,[x3,#240]
        ld1     {v28.4s,v29.4s},[x3],#32                // load key schedule...

        aese    v0.16b,v28.16b
        aesmc   v0.16b,v0.16b
        ld1     {v16.4s,v17.4s},[x3],#32                // load key schedule...
        aese    v0.16b,v29.16b
        aesmc   v0.16b,v0.16b
        subs    w6,w6,#10               // if rounds==10, jump to aes-128-xts processing
        b.eq    .Lxts_128_enc
.Lxts_enc_round_loop:
        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        ld1     {v16.4s},[x3],#16               // load key schedule...
        aese    v0.16b,v17.16b
        aesmc   v0.16b,v0.16b
        ld1     {v17.4s},[x3],#16               // load key schedule...
        subs    w6,w6,#2                // bias
        b.gt    .Lxts_enc_round_loop
.Lxts_128_enc:
        ld1     {v18.4s,v19.4s},[x3],#32                // load key schedule...
        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        aese    v0.16b,v17.16b
        aesmc   v0.16b,v0.16b
        ld1     {v20.4s,v21.4s},[x3],#32                // load key schedule...
        aese    v0.16b,v18.16b
        aesmc   v0.16b,v0.16b
        aese    v0.16b,v19.16b
        aesmc   v0.16b,v0.16b
        ld1     {v22.4s,v23.4s},[x3],#32                // load key schedule...
        aese    v0.16b,v20.16b
        aesmc   v0.16b,v0.16b
        aese    v0.16b,v21.16b
        aesmc   v0.16b,v0.16b
        ld1     {v7.4s},[x3]
        aese    v0.16b,v22.16b
        aesmc   v0.16b,v0.16b
        aese    v0.16b,v23.16b
        eor     v0.16b,v0.16b,v7.16b
        eor     v0.16b,v0.16b,v6.16b
        st1     {v0.16b},[x1]
        b       .Lxts_enc_final_abort

.align  4
.Lxts_enc_big_size:
        stp     x19,x20,[sp,#-64]!
        stp     x21,x22,[sp,#48]
        stp     d8,d9,[sp,#32]
        stp     d10,d11,[sp,#16]

        // tailcnt store the tail value of length%16.
        and     x21,x2,#0xf
        and     x2,x2,#-16
        subs    x2,x2,#16
        mov     x8,#16
        b.lo    .Lxts_abort
        csel    x8,xzr,x8,eq

        // Firstly, encrypt the iv with key2, as the first iv of XEX.
        ldr     w6,[x4,#240]
        ld1     {v0.4s},[x4],#16
        ld1     {v6.16b},[x5]
        sub     w6,w6,#2
        ld1     {v1.4s},[x4],#16

.Loop_iv_enc:
        aese    v6.16b,v0.16b
        aesmc   v6.16b,v6.16b
        ld1     {v0.4s},[x4],#16
        subs    w6,w6,#2
        aese    v6.16b,v1.16b
        aesmc   v6.16b,v6.16b
        ld1     {v1.4s},[x4],#16
        b.gt    .Loop_iv_enc

        aese    v6.16b,v0.16b
        aesmc   v6.16b,v6.16b
        ld1     {v0.4s},[x4]
        aese    v6.16b,v1.16b
        eor     v6.16b,v6.16b,v0.16b

        // The iv for second block
        // x9- iv(low), x10 - iv(high)
        // the five ivs stored into, v6.16b,v8.16b,v9.16b,v10.16b,v11.16b
        fmov    x9,d6
        fmov    x10,v6.d[1]
        mov     w19,#0x87
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr#31
        eor     x9,x11,x9,lsl#1
        fmov    d8,x9
        fmov    v8.d[1],x10

        ldr     w5,[x3,#240]            // next starting point
        ld1     {v0.16b},[x0],x8

        ld1     {v16.4s,v17.4s},[x3]                    // load key schedule...
        sub     w5,w5,#6
        add     x7,x3,x5,lsl#4          // pointer to last 7 round keys
        sub     w5,w5,#2
        ld1     {v18.4s,v19.4s},[x7],#32
        ld1     {v20.4s,v21.4s},[x7],#32
        ld1     {v22.4s,v23.4s},[x7],#32
        ld1     {v7.4s},[x7]

        add     x7,x3,#32
        mov     w6,w5

        // Encryption
.Lxts_enc:
        ld1     {v24.16b},[x0],#16
        subs    x2,x2,#32                       // bias
        add     w6,w5,#2
        orr     v3.16b,v0.16b,v0.16b
        orr     v1.16b,v0.16b,v0.16b
        orr     v28.16b,v0.16b,v0.16b
        orr     v27.16b,v24.16b,v24.16b
        orr     v29.16b,v24.16b,v24.16b
        b.lo    .Lxts_inner_enc_tail
        eor     v0.16b,v0.16b,v6.16b                    // before encryption, xor with iv
        eor     v24.16b,v24.16b,v8.16b

        // The iv for third block
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr#31
        eor     x9,x11,x9,lsl#1
        fmov    d9,x9
        fmov    v9.d[1],x10


        orr     v1.16b,v24.16b,v24.16b
        ld1     {v24.16b},[x0],#16
        orr     v2.16b,v0.16b,v0.16b
        orr     v3.16b,v1.16b,v1.16b
        eor     v27.16b,v24.16b,v9.16b          // the third block
        eor     v24.16b,v24.16b,v9.16b
        cmp     x2,#32
        b.lo    .Lxts_outer_enc_tail

        // The iv for fourth block
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr#31
        eor     x9,x11,x9,lsl#1
        fmov    d10,x9
        fmov    v10.d[1],x10

        ld1     {v25.16b},[x0],#16
        // The iv for fifth block
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr#31
        eor     x9,x11,x9,lsl#1
        fmov    d11,x9
        fmov    v11.d[1],x10

        ld1     {v26.16b},[x0],#16
        eor     v25.16b,v25.16b,v10.16b         // the fourth block
        eor     v26.16b,v26.16b,v11.16b
        sub     x2,x2,#32                       // bias
        mov     w6,w5
        b       .Loop5x_xts_enc

.align  4
.Loop5x_xts_enc:
        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v16.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v16.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v16.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v16.16b
        aesmc   v26.16b,v26.16b
        ld1     {v16.4s},[x7],#16
        subs    w6,w6,#2
        aese    v0.16b,v17.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v17.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v17.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v17.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v17.16b
        aesmc   v26.16b,v26.16b
        ld1     {v17.4s},[x7],#16
        b.gt    .Loop5x_xts_enc

        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v16.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v16.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v16.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v16.16b
        aesmc   v26.16b,v26.16b
        subs    x2,x2,#0x50                     // because .Lxts_enc_tail4x

        aese    v0.16b,v17.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v17.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v17.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v17.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v17.16b
        aesmc   v26.16b,v26.16b
        csel    x6,xzr,x2,gt            // borrow x6, w6, "gt" is not typo
        mov     x7,x3

        aese    v0.16b,v18.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v18.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v18.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v18.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v18.16b
        aesmc   v26.16b,v26.16b
        add     x0,x0,x6                // x0 is adjusted in such way that
                                                // at exit from the loop v1.16b-v26.16b
                                                // are loaded with last "words"
        add     x6,x2,#0x60             // because .Lxts_enc_tail4x

        aese    v0.16b,v19.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v19.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v19.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v19.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v19.16b
        aesmc   v26.16b,v26.16b

        aese    v0.16b,v20.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v20.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v20.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v20.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v20.16b
        aesmc   v26.16b,v26.16b

        aese    v0.16b,v21.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v21.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v21.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v21.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v21.16b
        aesmc   v26.16b,v26.16b

        aese    v0.16b,v22.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v22.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v22.16b
        aesmc   v24.16b,v24.16b
        aese    v25.16b,v22.16b
        aesmc   v25.16b,v25.16b
        aese    v26.16b,v22.16b
        aesmc   v26.16b,v26.16b

        eor     v4.16b,v7.16b,v6.16b
        aese    v0.16b,v23.16b
        // The iv for first block of one iteration
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr#31
        eor     x9,x11,x9,lsl#1
        fmov    d6,x9
        fmov    v6.d[1],x10
        eor     v5.16b,v7.16b,v8.16b
        ld1     {v2.16b},[x0],#16
        aese    v1.16b,v23.16b
        // The iv for second block
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr#31
        eor     x9,x11,x9,lsl#1
        fmov    d8,x9
        fmov    v8.d[1],x10
        eor     v17.16b,v7.16b,v9.16b
        ld1     {v3.16b},[x0],#16
        aese    v24.16b,v23.16b
        // The iv for third block
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr#31
        eor     x9,x11,x9,lsl#1
        fmov    d9,x9
        fmov    v9.d[1],x10
        eor     v30.16b,v7.16b,v10.16b
        ld1     {v27.16b},[x0],#16
        aese    v25.16b,v23.16b
        // The iv for fourth block
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr#31
        eor     x9,x11,x9,lsl#1
        fmov    d10,x9
        fmov    v10.d[1],x10
        eor     v31.16b,v7.16b,v11.16b
        ld1     {v28.16b},[x0],#16
        aese    v26.16b,v23.16b

        // The iv for fifth block
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr #31
        eor     x9,x11,x9,lsl #1
        fmov    d11,x9
        fmov    v11.d[1],x10

        ld1     {v29.16b},[x0],#16
        cbz     x6,.Lxts_enc_tail4x
        ld1     {v16.4s},[x7],#16               // re-pre-load rndkey[0]
        eor     v4.16b,v4.16b,v0.16b
        eor     v0.16b,v2.16b,v6.16b
        eor     v5.16b,v5.16b,v1.16b
        eor     v1.16b,v3.16b,v8.16b
        eor     v17.16b,v17.16b,v24.16b
        eor     v24.16b,v27.16b,v9.16b
        eor     v30.16b,v30.16b,v25.16b
        eor     v25.16b,v28.16b,v10.16b
        eor     v31.16b,v31.16b,v26.16b
        st1     {v4.16b},[x1],#16
        eor     v26.16b,v29.16b,v11.16b
        st1     {v5.16b},[x1],#16
        mov     w6,w5
        st1     {v17.16b},[x1],#16
        ld1     {v17.4s},[x7],#16               // re-pre-load rndkey[1]
        st1     {v30.16b},[x1],#16
        st1     {v31.16b},[x1],#16
        b.hs    .Loop5x_xts_enc


        // If left 4 blocks, borrow the five block's processing.
        cmn     x2,#0x10
        b.ne    .Loop5x_enc_after
        orr     v11.16b,v10.16b,v10.16b
        orr     v10.16b,v9.16b,v9.16b
        orr     v9.16b,v8.16b,v8.16b
        orr     v8.16b,v6.16b,v6.16b
        fmov    x9,d11
        fmov    x10,v11.d[1]
        eor     v0.16b,v6.16b,v2.16b
        eor     v1.16b,v8.16b,v3.16b
        eor     v24.16b,v27.16b,v9.16b
        eor     v25.16b,v28.16b,v10.16b
        eor     v26.16b,v29.16b,v11.16b
        b.eq    .Loop5x_xts_enc

.Loop5x_enc_after:
        add     x2,x2,#0x50
        cbz     x2,.Lxts_enc_done

        add     w6,w5,#2
        subs    x2,x2,#0x30
        b.lo    .Lxts_inner_enc_tail

        eor     v0.16b,v6.16b,v27.16b
        eor     v1.16b,v8.16b,v28.16b
        eor     v24.16b,v29.16b,v9.16b
        b       .Lxts_outer_enc_tail

.align  4
.Lxts_enc_tail4x:
        add     x0,x0,#16
        eor     v5.16b,v1.16b,v5.16b
        st1     {v5.16b},[x1],#16
        eor     v17.16b,v24.16b,v17.16b
        st1     {v17.16b},[x1],#16
        eor     v30.16b,v25.16b,v30.16b
        eor     v31.16b,v26.16b,v31.16b
        st1     {v30.16b,v31.16b},[x1],#32

        b       .Lxts_enc_done
.align  4
.Lxts_outer_enc_tail:
        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v16.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v16.16b
        aesmc   v24.16b,v24.16b
        ld1     {v16.4s},[x7],#16
        subs    w6,w6,#2
        aese    v0.16b,v17.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v17.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v17.16b
        aesmc   v24.16b,v24.16b
        ld1     {v17.4s},[x7],#16
        b.gt    .Lxts_outer_enc_tail

        aese    v0.16b,v16.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v16.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v16.16b
        aesmc   v24.16b,v24.16b
        eor     v4.16b,v6.16b,v7.16b
        subs    x2,x2,#0x30
        // The iv for first block
        fmov    x9,d9
        fmov    x10,v9.d[1]
        //mov   w19,#0x87
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr#31
        eor     x9,x11,x9,lsl#1
        fmov    d6,x9
        fmov    v6.d[1],x10
        eor     v5.16b,v8.16b,v7.16b
        csel    x6,x2,x6,lo       // x6, w6, is zero at this point
        aese    v0.16b,v17.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v17.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v17.16b
        aesmc   v24.16b,v24.16b
        eor     v17.16b,v9.16b,v7.16b

        add     x6,x6,#0x20
        add     x0,x0,x6
        mov     x7,x3

        aese    v0.16b,v20.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v20.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v20.16b
        aesmc   v24.16b,v24.16b
        aese    v0.16b,v21.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v21.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v21.16b
        aesmc   v24.16b,v24.16b
        aese    v0.16b,v22.16b
        aesmc   v0.16b,v0.16b
        aese    v1.16b,v22.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v22.16b
        aesmc   v24.16b,v24.16b
        aese    v0.16b,v23.16b
        aese    v1.16b,v23.16b
        aese    v24.16b,v23.16b
        ld1     {v27.16b},[x0],#16
        add     w6,w5,#2
        ld1     {v16.4s},[x7],#16                // re-pre-load rndkey[0]
        eor     v4.16b,v4.16b,v0.16b
        eor     v5.16b,v5.16b,v1.16b
        eor     v24.16b,v24.16b,v17.16b
        ld1     {v17.4s},[x7],#16                // re-pre-load rndkey[1]
        st1     {v4.16b},[x1],#16
        st1     {v5.16b},[x1],#16
        st1     {v24.16b},[x1],#16
        cmn     x2,#0x30
        b.eq    .Lxts_enc_done
.Lxts_encxor_one:
        orr     v28.16b,v3.16b,v3.16b
        orr     v29.16b,v27.16b,v27.16b
        nop

.Lxts_inner_enc_tail:
        cmn     x2,#0x10
        eor     v1.16b,v28.16b,v6.16b
        eor     v24.16b,v29.16b,v8.16b
        b.eq    .Lxts_enc_tail_loop
        eor     v24.16b,v29.16b,v6.16b
.Lxts_enc_tail_loop:
        aese    v1.16b,v16.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v16.16b
        aesmc   v24.16b,v24.16b
        ld1     {v16.4s},[x7],#16
        subs    w6,w6,#2
        aese    v1.16b,v17.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v17.16b
        aesmc   v24.16b,v24.16b
        ld1     {v17.4s},[x7],#16
        b.gt    .Lxts_enc_tail_loop

        aese    v1.16b,v16.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v16.16b
        aesmc   v24.16b,v24.16b
        aese    v1.16b,v17.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v17.16b
        aesmc   v24.16b,v24.16b
        aese    v1.16b,v20.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v20.16b
        aesmc   v24.16b,v24.16b
        cmn     x2,#0x20
        aese    v1.16b,v21.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v21.16b
        aesmc   v24.16b,v24.16b
        eor     v5.16b,v6.16b,v7.16b
        aese    v1.16b,v22.16b
        aesmc   v1.16b,v1.16b
        aese    v24.16b,v22.16b
        aesmc   v24.16b,v24.16b
        eor     v17.16b,v8.16b,v7.16b
        aese    v1.16b,v23.16b
        aese    v24.16b,v23.16b
        b.eq    .Lxts_enc_one
        eor     v5.16b,v5.16b,v1.16b
        st1     {v5.16b},[x1],#16
        eor     v17.16b,v17.16b,v24.16b
        orr     v6.16b,v8.16b,v8.16b
        st1     {v17.16b},[x1],#16
        fmov    x9,d8
        fmov    x10,v8.d[1]
        mov     w19,#0x87
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr #31
        eor     x9,x11,x9,lsl #1
        fmov    d6,x9
        fmov    v6.d[1],x10
        b       .Lxts_enc_done

.Lxts_enc_one:
        eor     v5.16b,v5.16b,v24.16b
        orr     v6.16b,v6.16b,v6.16b
        st1     {v5.16b},[x1],#16
        fmov    x9,d6
        fmov    x10,v6.d[1]
        mov     w19,#0x87
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr #31
        eor     x9,x11,x9,lsl #1
        fmov    d6,x9
        fmov    v6.d[1],x10
        b       .Lxts_enc_done
.align  5
.Lxts_enc_done:
        // Process the tail block with cipher stealing.
        tst     x21,#0xf
        b.eq    .Lxts_abort

        mov     x20,x0
        mov     x13,x1
        sub     x1,x1,#16
.composite_enc_loop:
        subs    x21,x21,#1
        ldrb    w15,[x1,x21]
        ldrb    w14,[x20,x21]
        strb    w15,[x13,x21]
        strb    w14,[x1,x21]
        b.gt    .composite_enc_loop
.Lxts_enc_load_done:
        ld1     {v26.16b},[x1]
        eor     v26.16b,v26.16b,v6.16b

        // Encrypt the composite block to get the last second encrypted text block
        ldr     w6,[x3,#240]            // load key schedule...
        ld1     {v0.4s},[x3],#16
        sub     w6,w6,#2
        ld1     {v1.4s},[x3],#16                // load key schedule...
.Loop_final_enc:
        aese    v26.16b,v0.16b
        aesmc   v26.16b,v26.16b
        ld1     {v0.4s},[x3],#16
        subs    w6,w6,#2
        aese    v26.16b,v1.16b
        aesmc   v26.16b,v26.16b
        ld1     {v1.4s},[x3],#16
        b.gt    .Loop_final_enc

        aese    v26.16b,v0.16b
        aesmc   v26.16b,v26.16b
        ld1     {v0.4s},[x3]
        aese    v26.16b,v1.16b
        eor     v26.16b,v26.16b,v0.16b
        eor     v26.16b,v26.16b,v6.16b
        st1     {v26.16b},[x1]

.Lxts_abort:
        ldp     x21,x22,[sp,#48]
        ldp     d8,d9,[sp,#32]
        ldp     d10,d11,[sp,#16]
        ldp     x19,x20,[sp],#64
.Lxts_enc_final_abort:
        ret
.size   aes_v8_xts_encrypt,.-aes_v8_xts_encrypt
.globl  aes_v8_xts_decrypt
.type   aes_v8_xts_decrypt,%function
.align  5
aes_v8_xts_decrypt:
        AARCH64_VALID_CALL_TARGET
        cmp     x2,#16
        // Original input data size bigger than 16, jump to big size processing.
        b.ne    .Lxts_dec_big_size
        // Encrypt the iv with key2, as the first XEX iv.
        ldr     w6,[x4,#240]
        ld1     {v0.4s},[x4],#16
        ld1     {v6.16b},[x5]
        sub     w6,w6,#2
        ld1     {v1.4s},[x4],#16

.Loop_dec_small_iv_enc:
        aese    v6.16b,v0.16b
        aesmc   v6.16b,v6.16b
        ld1     {v0.4s},[x4],#16
        subs    w6,w6,#2
        aese    v6.16b,v1.16b
        aesmc   v6.16b,v6.16b
        ld1     {v1.4s},[x4],#16
        b.gt    .Loop_dec_small_iv_enc

        aese    v6.16b,v0.16b
        aesmc   v6.16b,v6.16b
        ld1     {v0.4s},[x4]
        aese    v6.16b,v1.16b
        eor     v6.16b,v6.16b,v0.16b

        ld1     {v0.16b},[x0]
        eor     v0.16b,v6.16b,v0.16b

        ldr     w6,[x3,#240]
        ld1     {v28.4s,v29.4s},[x3],#32                        // load key schedule...

        aesd    v0.16b,v28.16b
        aesimc  v0.16b,v0.16b
        ld1     {v16.4s,v17.4s},[x3],#32                        // load key schedule...
        aesd    v0.16b,v29.16b
        aesimc  v0.16b,v0.16b
        subs    w6,w6,#10                       // bias
        b.eq    .Lxts_128_dec
.Lxts_dec_round_loop:
        aesd    v0.16b,v16.16b
        aesimc  v0.16b,v0.16b
        ld1     {v16.4s},[x3],#16                       // load key schedule...
        aesd    v0.16b,v17.16b
        aesimc  v0.16b,v0.16b
        ld1     {v17.4s},[x3],#16                       // load key schedule...
        subs    w6,w6,#2                        // bias
        b.gt    .Lxts_dec_round_loop
.Lxts_128_dec:
        ld1     {v18.4s,v19.4s},[x3],#32                        // load key schedule...
        aesd    v0.16b,v16.16b
        aesimc  v0.16b,v0.16b
        aesd    v0.16b,v17.16b
        aesimc  v0.16b,v0.16b
        ld1     {v20.4s,v21.4s},[x3],#32                        // load key schedule...
        aesd    v0.16b,v18.16b
        aesimc  v0.16b,v0.16b
        aesd    v0.16b,v19.16b
        aesimc  v0.16b,v0.16b
        ld1     {v22.4s,v23.4s},[x3],#32                        // load key schedule...
        aesd    v0.16b,v20.16b
        aesimc  v0.16b,v0.16b
        aesd    v0.16b,v21.16b
        aesimc  v0.16b,v0.16b
        ld1     {v7.4s},[x3]
        aesd    v0.16b,v22.16b
        aesimc  v0.16b,v0.16b
        aesd    v0.16b,v23.16b
        eor     v0.16b,v0.16b,v7.16b
        eor     v0.16b,v6.16b,v0.16b
        st1     {v0.16b},[x1]
        b       .Lxts_dec_final_abort
.Lxts_dec_big_size:
        stp     x19,x20,[sp,#-64]!
        stp     x21,x22,[sp,#48]
        stp     d8,d9,[sp,#32]
        stp     d10,d11,[sp,#16]

        and     x21,x2,#0xf
        and     x2,x2,#-16
        subs    x2,x2,#16
        mov     x8,#16
        b.lo    .Lxts_dec_abort

        // Encrypt the iv with key2, as the first XEX iv
        ldr     w6,[x4,#240]
        ld1     {v0.4s},[x4],#16
        ld1     {v6.16b},[x5]
        sub     w6,w6,#2
        ld1     {v1.4s},[x4],#16

.Loop_dec_iv_enc:
        aese    v6.16b,v0.16b
        aesmc   v6.16b,v6.16b
        ld1     {v0.4s},[x4],#16
        subs    w6,w6,#2
        aese    v6.16b,v1.16b
        aesmc   v6.16b,v6.16b
        ld1     {v1.4s},[x4],#16
        b.gt    .Loop_dec_iv_enc

        aese    v6.16b,v0.16b
        aesmc   v6.16b,v6.16b
        ld1     {v0.4s},[x4]
        aese    v6.16b,v1.16b
        eor     v6.16b,v6.16b,v0.16b

        // The iv for second block
        // x9- iv(low), x10 - iv(high)
        // the five ivs stored into, v6.16b,v8.16b,v9.16b,v10.16b,v11.16b
        fmov    x9,d6
        fmov    x10,v6.d[1]
        mov     w19,#0x87
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr #31
        eor     x9,x11,x9,lsl #1
        fmov    d8,x9
        fmov    v8.d[1],x10

        ldr     w5,[x3,#240]            // load rounds number

        // The iv for third block
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr #31
        eor     x9,x11,x9,lsl #1
        fmov    d9,x9
        fmov    v9.d[1],x10

        ld1     {v16.4s,v17.4s},[x3]                    // load key schedule...
        sub     w5,w5,#6
        add     x7,x3,x5,lsl#4          // pointer to last 7 round keys
        sub     w5,w5,#2
        ld1     {v18.4s,v19.4s},[x7],#32                // load key schedule...
        ld1     {v20.4s,v21.4s},[x7],#32
        ld1     {v22.4s,v23.4s},[x7],#32
        ld1     {v7.4s},[x7]

        // The iv for fourth block
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr #31
        eor     x9,x11,x9,lsl #1
        fmov    d10,x9
        fmov    v10.d[1],x10

        add     x7,x3,#32
        mov     w6,w5
        b       .Lxts_dec

        // Decryption
.align  5
.Lxts_dec:
        tst     x21,#0xf
        b.eq    .Lxts_dec_begin
        subs    x2,x2,#16
        csel    x8,xzr,x8,eq
        ld1     {v0.16b},[x0],#16
        b.lo    .Lxts_done
        sub     x0,x0,#16
.Lxts_dec_begin:
        ld1     {v0.16b},[x0],x8
        subs    x2,x2,#32                       // bias
        add     w6,w5,#2
        orr     v3.16b,v0.16b,v0.16b
        orr     v1.16b,v0.16b,v0.16b
        orr     v28.16b,v0.16b,v0.16b
        ld1     {v24.16b},[x0],#16
        orr     v27.16b,v24.16b,v24.16b
        orr     v29.16b,v24.16b,v24.16b
        b.lo    .Lxts_inner_dec_tail
        eor     v0.16b,v0.16b,v6.16b                    // before decryt, xor with iv
        eor     v24.16b,v24.16b,v8.16b

        orr     v1.16b,v24.16b,v24.16b
        ld1     {v24.16b},[x0],#16
        orr     v2.16b,v0.16b,v0.16b
        orr     v3.16b,v1.16b,v1.16b
        eor     v27.16b,v24.16b,v9.16b                  // third block xox with third iv
        eor     v24.16b,v24.16b,v9.16b
        cmp     x2,#32
        b.lo    .Lxts_outer_dec_tail

        ld1     {v25.16b},[x0],#16

        // The iv for fifth block
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr #31
        eor     x9,x11,x9,lsl #1
        fmov    d11,x9
        fmov    v11.d[1],x10

        ld1     {v26.16b},[x0],#16
        eor     v25.16b,v25.16b,v10.16b         // the fourth block
        eor     v26.16b,v26.16b,v11.16b
        sub     x2,x2,#32                       // bias
        mov     w6,w5
        b       .Loop5x_xts_dec

.align  4
.Loop5x_xts_dec:
        aesd    v0.16b,v16.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v16.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v16.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v16.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v16.16b
        aesimc  v26.16b,v26.16b
        ld1     {v16.4s},[x7],#16               // load key schedule...
        subs    w6,w6,#2
        aesd    v0.16b,v17.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v17.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v17.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v17.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v17.16b
        aesimc  v26.16b,v26.16b
        ld1     {v17.4s},[x7],#16               // load key schedule...
        b.gt    .Loop5x_xts_dec

        aesd    v0.16b,v16.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v16.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v16.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v16.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v16.16b
        aesimc  v26.16b,v26.16b
        subs    x2,x2,#0x50                     // because .Lxts_dec_tail4x

        aesd    v0.16b,v17.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v17.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v17.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v17.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v17.16b
        aesimc  v26.16b,v26.16b
        csel    x6,xzr,x2,gt            // borrow x6, w6, "gt" is not typo
        mov     x7,x3

        aesd    v0.16b,v18.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v18.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v18.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v18.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v18.16b
        aesimc  v26.16b,v26.16b
        add     x0,x0,x6                // x0 is adjusted in such way that
                                                // at exit from the loop v1.16b-v26.16b
                                                // are loaded with last "words"
        add     x6,x2,#0x60             // because .Lxts_dec_tail4x

        aesd    v0.16b,v19.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v19.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v19.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v19.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v19.16b
        aesimc  v26.16b,v26.16b

        aesd    v0.16b,v20.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v20.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v20.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v20.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v20.16b
        aesimc  v26.16b,v26.16b

        aesd    v0.16b,v21.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v21.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v21.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v21.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v21.16b
        aesimc  v26.16b,v26.16b

        aesd    v0.16b,v22.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v22.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v22.16b
        aesimc  v24.16b,v24.16b
        aesd    v25.16b,v22.16b
        aesimc  v25.16b,v25.16b
        aesd    v26.16b,v22.16b
        aesimc  v26.16b,v26.16b

        eor     v4.16b,v7.16b,v6.16b
        aesd    v0.16b,v23.16b
        // The iv for first block of next iteration.
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr #31
        eor     x9,x11,x9,lsl #1
        fmov    d6,x9
        fmov    v6.d[1],x10
        eor     v5.16b,v7.16b,v8.16b
        ld1     {v2.16b},[x0],#16
        aesd    v1.16b,v23.16b
        // The iv for second block
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr #31
        eor     x9,x11,x9,lsl #1
        fmov    d8,x9
        fmov    v8.d[1],x10
        eor     v17.16b,v7.16b,v9.16b
        ld1     {v3.16b},[x0],#16
        aesd    v24.16b,v23.16b
        // The iv for third block
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr #31
        eor     x9,x11,x9,lsl #1
        fmov    d9,x9
        fmov    v9.d[1],x10
        eor     v30.16b,v7.16b,v10.16b
        ld1     {v27.16b},[x0],#16
        aesd    v25.16b,v23.16b
        // The iv for fourth block
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr #31
        eor     x9,x11,x9,lsl #1
        fmov    d10,x9
        fmov    v10.d[1],x10
        eor     v31.16b,v7.16b,v11.16b
        ld1     {v28.16b},[x0],#16
        aesd    v26.16b,v23.16b

        // The iv for fifth block
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr #31
        eor     x9,x11,x9,lsl #1
        fmov    d11,x9
        fmov    v11.d[1],x10

        ld1     {v29.16b},[x0],#16
        cbz     x6,.Lxts_dec_tail4x
        ld1     {v16.4s},[x7],#16               // re-pre-load rndkey[0]
        eor     v4.16b,v4.16b,v0.16b
        eor     v0.16b,v2.16b,v6.16b
        eor     v5.16b,v5.16b,v1.16b
        eor     v1.16b,v3.16b,v8.16b
        eor     v17.16b,v17.16b,v24.16b
        eor     v24.16b,v27.16b,v9.16b
        eor     v30.16b,v30.16b,v25.16b
        eor     v25.16b,v28.16b,v10.16b
        eor     v31.16b,v31.16b,v26.16b
        st1     {v4.16b},[x1],#16
        eor     v26.16b,v29.16b,v11.16b
        st1     {v5.16b},[x1],#16
        mov     w6,w5
        st1     {v17.16b},[x1],#16
        ld1     {v17.4s},[x7],#16               // re-pre-load rndkey[1]
        st1     {v30.16b},[x1],#16
        st1     {v31.16b},[x1],#16
        b.hs    .Loop5x_xts_dec

        cmn     x2,#0x10
        b.ne    .Loop5x_dec_after
        // If x2(x2) equal to -0x10, the left blocks is 4.
        // After specially processing, utilize the five blocks processing again.
        // It will use the following IVs: v6.16b,v6.16b,v8.16b,v9.16b,v10.16b.
        orr     v11.16b,v10.16b,v10.16b
        orr     v10.16b,v9.16b,v9.16b
        orr     v9.16b,v8.16b,v8.16b
        orr     v8.16b,v6.16b,v6.16b
        fmov    x9,d11
        fmov    x10,v11.d[1]
        eor     v0.16b,v6.16b,v2.16b
        eor     v1.16b,v8.16b,v3.16b
        eor     v24.16b,v27.16b,v9.16b
        eor     v25.16b,v28.16b,v10.16b
        eor     v26.16b,v29.16b,v11.16b
        b.eq    .Loop5x_xts_dec

.Loop5x_dec_after:
        add     x2,x2,#0x50
        cbz     x2,.Lxts_done

        add     w6,w5,#2
        subs    x2,x2,#0x30
        b.lo    .Lxts_inner_dec_tail

        eor     v0.16b,v6.16b,v27.16b
        eor     v1.16b,v8.16b,v28.16b
        eor     v24.16b,v29.16b,v9.16b
        b       .Lxts_outer_dec_tail

.align  4
.Lxts_dec_tail4x:
        add     x0,x0,#16
        tst     x21,#0xf
        eor     v5.16b,v1.16b,v4.16b
        st1     {v5.16b},[x1],#16
        eor     v17.16b,v24.16b,v17.16b
        st1     {v17.16b},[x1],#16
        eor     v30.16b,v25.16b,v30.16b
        eor     v31.16b,v26.16b,v31.16b
        st1     {v30.16b,v31.16b},[x1],#32

        b.eq    .Lxts_dec_abort
        ld1     {v0.16b},[x0],#16
        b       .Lxts_done
.align  4
.Lxts_outer_dec_tail:
        aesd    v0.16b,v16.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v16.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v16.16b
        aesimc  v24.16b,v24.16b
        ld1     {v16.4s},[x7],#16
        subs    w6,w6,#2
        aesd    v0.16b,v17.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v17.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v17.16b
        aesimc  v24.16b,v24.16b
        ld1     {v17.4s},[x7],#16
        b.gt    .Lxts_outer_dec_tail

        aesd    v0.16b,v16.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v16.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v16.16b
        aesimc  v24.16b,v24.16b
        eor     v4.16b,v6.16b,v7.16b
        subs    x2,x2,#0x30
        // The iv for first block
        fmov    x9,d9
        fmov    x10,v9.d[1]
        mov     w19,#0x87
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr #31
        eor     x9,x11,x9,lsl #1
        fmov    d6,x9
        fmov    v6.d[1],x10
        eor     v5.16b,v8.16b,v7.16b
        csel    x6,x2,x6,lo     // x6, w6, is zero at this point
        aesd    v0.16b,v17.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v17.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v17.16b
        aesimc  v24.16b,v24.16b
        eor     v17.16b,v9.16b,v7.16b
        // The iv for second block
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr #31
        eor     x9,x11,x9,lsl #1
        fmov    d8,x9
        fmov    v8.d[1],x10

        add     x6,x6,#0x20
        add     x0,x0,x6                // x0 is adjusted to the last data

        mov     x7,x3

        // The iv for third block
        extr    x22,x10,x10,#32
        extr    x10,x10,x9,#63
        and     w11,w19,w22,asr #31
        eor     x9,x11,x9,lsl #1
        fmov    d9,x9
        fmov    v9.d[1],x10

        aesd    v0.16b,v20.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v20.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v20.16b
        aesimc  v24.16b,v24.16b
        aesd    v0.16b,v21.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v21.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v21.16b
        aesimc  v24.16b,v24.16b
        aesd    v0.16b,v22.16b
        aesimc  v0.16b,v0.16b
        aesd    v1.16b,v22.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v22.16b
        aesimc  v24.16b,v24.16b
        ld1     {v27.16b},[x0],#16
        aesd    v0.16b,v23.16b
        aesd    v1.16b,v23.16b
        aesd    v24.16b,v23.16b
        ld1     {v16.4s},[x7],#16               // re-pre-load rndkey[0]
        add     w6,w5,#2
        eor     v4.16b,v4.16b,v0.16b
        eor     v5.16b,v5.16b,v1.16b
        eor     v24.16b,v24.16b,v17.16b
        ld1     {v17.4s},[x7],#16               // re-pre-load rndkey[1]
        st1     {v4.16b},[x1],#16
        st1     {v5.16b},[x1],#16
        st1     {v24.16b},[x1],#16

        cmn     x2,#0x30
        add     x2,x2,#0x30
        b.eq    .Lxts_done
        sub     x2,x2,#0x30
        orr     v28.16b,v3.16b,v3.16b
        orr     v29.16b,v27.16b,v27.16b
        nop

.Lxts_inner_dec_tail:
        // x2 == -0x10 means two blocks left.
        cmn     x2,#0x10
        eor     v1.16b,v28.16b,v6.16b
        eor     v24.16b,v29.16b,v8.16b
        b.eq    .Lxts_dec_tail_loop
        eor     v24.16b,v29.16b,v6.16b
.Lxts_dec_tail_loop:
        aesd    v1.16b,v16.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v16.16b
        aesimc  v24.16b,v24.16b
        ld1     {v16.4s},[x7],#16
        subs    w6,w6,#2
        aesd    v1.16b,v17.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v17.16b
        aesimc  v24.16b,v24.16b
        ld1     {v17.4s},[x7],#16
        b.gt    .Lxts_dec_tail_loop

        aesd    v1.16b,v16.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v16.16b
        aesimc  v24.16b,v24.16b
        aesd    v1.16b,v17.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v17.16b
        aesimc  v24.16b,v24.16b
        aesd    v1.16b,v20.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v20.16b
        aesimc  v24.16b,v24.16b
        cmn     x2,#0x20
        aesd    v1.16b,v21.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v21.16b
        aesimc  v24.16b,v24.16b
        eor     v5.16b,v6.16b,v7.16b
        aesd    v1.16b,v22.16b
        aesimc  v1.16b,v1.16b
        aesd    v24.16b,v22.16b
        aesimc  v24.16b,v24.16b
        eor     v17.16b,v8.16b,v7.16b
        aesd    v1.16b,v23.16b
        aesd    v24.16b,v23.16b
        b.eq    .Lxts_dec_one
        eor     v5.16b,v5.16b,v1.16b
        eor     v17.16b,v17.16b,v24.16b
        orr     v6.16b,v9.16b,v9.16b
        orr     v8.16b,v10.16b,v10.16b
        st1     {v5.16b},[x1],#16
        st1     {v17.16b},[x1],#16
        add     x2,x2,#16
        b       .Lxts_done

.Lxts_dec_one:
        eor     v5.16b,v5.16b,v24.16b
        orr     v6.16b,v8.16b,v8.16b
        orr     v8.16b,v9.16b,v9.16b
        st1     {v5.16b},[x1],#16
        add     x2,x2,#32

.Lxts_done:
        tst     x21,#0xf
        b.eq    .Lxts_dec_abort
        // Processing the last two blocks with cipher stealing.
        mov     x7,x3
        cbnz    x2,.Lxts_dec_1st_done
        ld1     {v0.16b},[x0],#16

        // Decrypt the last second block to get the last plain text block
.Lxts_dec_1st_done:
        eor     v26.16b,v0.16b,v8.16b
        ldr     w6,[x3,#240]
        ld1     {v0.4s},[x3],#16
        sub     w6,w6,#2
        ld1     {v1.4s},[x3],#16
.Loop_final_2nd_dec:
        aesd    v26.16b,v0.16b
        aesimc  v26.16b,v26.16b
        ld1     {v0.4s},[x3],#16                // load key schedule...
        subs    w6,w6,#2
        aesd    v26.16b,v1.16b
        aesimc  v26.16b,v26.16b
        ld1     {v1.4s},[x3],#16                // load key schedule...
        b.gt    .Loop_final_2nd_dec

        aesd    v26.16b,v0.16b
        aesimc  v26.16b,v26.16b
        ld1     {v0.4s},[x3]
        aesd    v26.16b,v1.16b
        eor     v26.16b,v26.16b,v0.16b
        eor     v26.16b,v26.16b,v8.16b
        st1     {v26.16b},[x1]

        mov     x20,x0
        add     x13,x1,#16

        // Composite the tailcnt "16 byte not aligned block" into the last second plain blocks
        // to get the last encrypted block.
.composite_dec_loop:
        subs    x21,x21,#1
        ldrb    w15,[x1,x21]
        ldrb    w14,[x20,x21]
        strb    w15,[x13,x21]
        strb    w14,[x1,x21]
        b.gt    .composite_dec_loop
.Lxts_dec_load_done:
        ld1     {v26.16b},[x1]
        eor     v26.16b,v26.16b,v6.16b

        // Decrypt the composite block to get the last second plain text block
        ldr     w6,[x7,#240]
        ld1     {v0.4s},[x7],#16
        sub     w6,w6,#2
        ld1     {v1.4s},[x7],#16
.Loop_final_dec:
        aesd    v26.16b,v0.16b
        aesimc  v26.16b,v26.16b
        ld1     {v0.4s},[x7],#16                // load key schedule...
        subs    w6,w6,#2
        aesd    v26.16b,v1.16b
        aesimc  v26.16b,v26.16b
        ld1     {v1.4s},[x7],#16                // load key schedule...
        b.gt    .Loop_final_dec

        aesd    v26.16b,v0.16b
        aesimc  v26.16b,v26.16b
        ld1     {v0.4s},[x7]
        aesd    v26.16b,v1.16b
        eor     v26.16b,v26.16b,v0.16b
        eor     v26.16b,v26.16b,v6.16b
        st1     {v26.16b},[x1]

.Lxts_dec_abort:
        ldp     x21,x22,[sp,#48]
        ldp     d8,d9,[sp,#32]
        ldp     d10,d11,[sp,#16]
        ldp     x19,x20,[sp],#64

.Lxts_dec_final_abort:
        ret
.size   aes_v8_xts_decrypt,.-aes_v8_xts_decrypt
#endif
